Lucene field indexer written in Perl
diff --git a/Makefile.PL b/Makefile.PL
new file mode 100644
index 0000000..b076284
--- /dev/null
+++ b/Makefile.PL
@@ -0,0 +1,27 @@
+#!/usr/bin/env perl
+use v5.16;
+use strict;
+use warnings;
+use ExtUtils::MakeMaker;
+
+WriteMakefile(
+ NAME => 'KorAP::Bundle',
+ AUTHOR => 'Nils Diewald',
+ ABSTRACT => 'Perl Implementation for Generating Multifoundry Lucene Indices',
+ VERSION_FROM => 'lib/KorAP/Bundle.pm',
+ PREREQ_PM => {
+ 'Mojolicious' => 4.51,
+ 'Packed::Array' => 0.01,
+ 'Array::IntSpan' => 0,
+ 'Set::Scalar' => 0,
+ 'Log::Log4perl' => 1.42,
+ 'Carp' => 0,
+ 'strict' => 0,
+ 'warnings' => 0,
+ 'utf8' => 0,
+ 'bytes' => 0
+ },
+# LICENSE => 'perl',
+ MIN_PERL_VERSION => '5.016',
+ test => {
+ TESTS => 't/*.t'
+ }
+);
diff --git a/lib/KorAP/Bundle.pm b/lib/KorAP/Bundle.pm
new file mode 100644
index 0000000..43997e3
--- /dev/null
+++ b/lib/KorAP/Bundle.pm
@@ -0,0 +1,5 @@
+package KorAP::Bundle;
+
+our $VERSION = 0.01;
+
+1;
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
new file mode 100644
index 0000000..8f80e6c
--- /dev/null
+++ b/lib/KorAP/Document.pm
@@ -0,0 +1,250 @@
+package KorAP::Document;
+use Mojo::Base -base;
+use v5.16;
+
+use Mojo::ByteStream 'b';
+use Mojo::DOM;
+use Carp qw/croak carp/;
+use KorAP::Document::Primary;
+
+has [qw/id corpus_id path/];
+has [qw/pub_date title sub_title pub_place/];
+
+# parse document
+sub parse {
+ my $self = shift;
+ my $file = b($self->path . 'data.xml')->slurp;
+
+ state $unable = 'Unable to parse document';
+
+ carp 'Parse document ' . $self->path;
+
+ my $dom = Mojo::DOM->new($file);
+
+ my $rt = $dom->at('raw_text');
+
+ # Get document id and corpus id
+ if ($rt && $rt->attr('docid')) {
+ $self->id($rt->attr('docid'));
+ if ($self->id =~ /^([^_]+)_/) {
+ $self->corpus_id($1);
+ }
+ else {
+ croak $unable;
+ };
+ }
+ else {
+ croak $unable;
+ };
+
+ # Get primary data
+ my $pd = $rt->at('text');
+ if ($pd) {
+
+ $pd = b($pd->text)->decode;
+ $self->{pd} = KorAP::Document::Primary->new($pd->to_string);
+ }
+ else {
+ croak $unable;
+ };
+
+ # Get meta data
+ $self->_parse_meta;
+ return 1;
+};
+
+
+# Primary data
+sub primary {
+ $_[0]->{pd};
+};
+
+sub author {
+ my $self = shift;
+
+ # Set authors
+ if ($_[0]) {
+ return $self->{authors} = [
+ grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
+ ];
+ }
+ return ($self->{authors} // []);
+};
+
+sub text_class {
+ my $self = shift;
+ if ($_[0]) {
+ return $self->{topics} = [ @_ ];
+ };
+ return ($self->{topics} // []);
+};
+
+
+
+sub _parse_meta {
+ my $self = shift;
+
+ my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1')->encode;
+
+ my $dom = Mojo::DOM->new($file);
+ my $monogr = $dom->at('monogr');
+
+ # Get title
+ my $title = $monogr->at('h\.title[type=main]');
+ $self->title($title->text) if $title;
+
+ # Get Subtitle
+ my $sub_title = $monogr->at('h\.title[type=sub]');
+ $self->sub_title($sub_title->text) if $sub_title;
+
+ # Get Author
+ my $author = $monogr->at('h\.author');
+ $self->author($author->all_text) if $author;
+
+ # Get pubDate
+ my $year = $dom->at("pubDate[type=year]");
+ $year = $year ? $year->text : 0;
+ my $month = $dom->at("pubDate[type=month]");
+ $month = $month ? $month->text : 0;
+ my $day = $dom->at("pubDate[type=day]");
+ $day = $day ? $day->text : 0;
+
+ my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
+ $date .= length($month) == 1 ? '0' . $month : $month;
+ $date .= length($day) == 1 ? '0' . $day : $day;
+
+ $self->pub_date($date);
+
+ # Get textClasses
+ my @topic;
+ $dom->find("textClass catRef")->each(
+ sub {
+ my ($ign, @ttopic) = split('\.', $_->attr('target'));
+ push(@topic, @ttopic);
+ }
+ );
+ $self->text_class(@topic);
+};
+
+
+1;
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+KorAP::Document
+
+
+=head1 SYNOPSIS
+
+ my $doc = KorAP::Document->new(
+ path => 'mydoc-1/'
+ );
+
+ $doc->parse;
+
+ print $doc->title;
+
+
+=head1 DESCRIPTION
+
+Parse the primary data and metadata of a document.
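+
+The document is expected to reside in its own directory,
+containing at least a C<data.xml> file with the primary text
+and a C<header.xml> file with the metadata, e.g.
+
+ mydoc-1/
+   data.xml
+   header.xml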
+
+
+=head1 ATTRIBUTES
+
+=head2 id
+
+ $doc->id(75476);
+ print $doc->id;
+
+The unique identifier of the document.
+
+
+=head2 corpus_id
+
+ $doc->corpus_id(4);
+ print $doc->corpus_id;
+
+The unique identifier of the corpus.
+
+
+=head2 path
+
+ $doc->path("example-004/");
+ print $doc->path;
+
+The path of the document.
+
+
+=head2 title
+
+ $doc->title("Der Name der Rose");
+ print $doc->title;
+
+The title of the document.
+
+
+=head2 sub_title
+
+ $doc->sub_title("Natürlich eine Handschrift");
+ print $doc->sub_title;
+
+The subtitle of the document.
+
+
+=head2 pub_place
+
+ $doc->pub_place("Rom");
+ print $doc->pub_place;
+
+The publication place of the document.
+
+
+=head2 pub_date
+
+ $doc->pub_place("19800404");
+ print $doc->pub_place;
+
+The publication date of the document,
+in the format "YYYYMMDD".
+
+
+=head2 primary
+
+ print $doc->primary->data(0,20);
+
+The L<KorAP::Document::Primary> object containing the primary data.
+
+
+=head2 author
+
+ $doc->author('Binks, Jar Jar; Luke Skywalker');
+ print $doc->author->[0];
+
+Set the author value as a semicolon-separated list of names or
+get an array reference of author names.
+
+=head2 text_class
+
+ $doc->text_class(qw/news sports/);
+ print $doc->text_class->[0];
+
+Set the text class as an array or get an array
+reference of text classes.
+
+
+=head1 METHODS
+
+=head2 parse
+
+ $doc->parse;
+
+Run the parsing process of the document.
+
+
+=cut
diff --git a/lib/KorAP/Document/Primary.pm b/lib/KorAP/Document/Primary.pm
new file mode 100644
index 0000000..52ca844
--- /dev/null
+++ b/lib/KorAP/Document/Primary.pm
@@ -0,0 +1,153 @@
+package KorAP::Document::Primary;
+use strict;
+use warnings;
+use Carp qw/croak/;
+use Mojo::ByteStream 'b';
+use feature 'state';
+use Packed::Array;
+
+
+# Constructor
+sub new {
+ my $class = shift;
+ bless [shift()], $class;
+};
+
+
+# Get the data as a substring
+sub data {
+ my $self = shift;
+ my ($from, $to) = @_;
+
+ return b(substr($self->[0], $from))->encode if $from && !$to;
+
+ return b($self->[0])->encode unless $to;
+
+ my $substr = substr($self->[0], $from, $to - $from);
+ if ($substr) {
+ return b($substr)->encode;
+ };
+ # encode 'UTF-8',
+ croak 'Unable to find substring';
+};
+
+
+# The length of the primary text in characters
+sub data_length {
+ my $self = shift;
+ return $self->[1] if $self->[1];
+ $self->[1] = length($self->[0]);
+ return $self->[1];
+};
+
+
+# Get correct offset
+sub bytes2chars {
+ my $self = shift;
+ unless ($self->[2]) {
+ $self->_calc_chars;
+ };
+ return $self->[2]->[shift];
+};
+
+
+# Calculate character offsets
+sub _calc_chars {
+ use bytes;
+
+ my $self = shift;
+ tie my @array, 'Packed::Array';
+
+ state $leading = pack( 'B8', '10000000' );
+ state $start = pack( 'B8', '01000000' );
+
+ my ($i, $j) = (0,0);
+ my $c;
+
+ # Init array
+ my $l = length($self->[0]);
+ $array[$l-1] = 0;
+
+ # Iterate over every character
+ while ($i < $l) {
+
+ # Get actual character
+ $c = substr($self->[0], $i, 1);
+
+ # store character position
+ $array[$i++] = $j;
+
+ # This is the start of a multibyte sequence
+ if (ord($c & $leading) && ord($c & $start)) {
+
+ # Get the next byte - expecting a following character
+ $c = substr($self->[0], $i, 1);
+
+ # Character is part of a multibyte
+ while (ord($c & $leading)) {
+
+ # Set count
+ $array[$i] = (ord($c & $start)) ? ++$j : $j;
+
+ # Get next character
+ $c = substr($self->[0], ++$i, 1);
+ };
+ };
+
+ $j++;
+ };
+
+ $self->[2] = \@array;
+};
+
+
+1;
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+KorAP::Document::Primary
+
+=head1 SYNOPSIS
+
+ my $text = KorAP::Document::Primary->new('Das ist mein Text');
+ print $text->data(2,5);
+ print $text->data_length;
+
+
+=head1 DESCRIPTION
+
+Represent textual data with annotated character and byte offsets.
+
+
+=head1 ATTRIBUTES
+
+=head2 data_length
+
+ print $text->data_length;
+
+The textual length in number of characters.
+
+
+=head1 METHODS
+
+=head2 data
+
+ print $text->data;
+ print $text->data(4);
+ print $text->data(5,17);
+
+Return the textual data as a substring. Accepts a start and an end character
+offset. The data will be wrapped in a UTF-8 encoded L<Mojo::ByteStream>.
+
+=head2 bytes2chars
+
+ print $text->bytes2chars(40);
+
+Calculate the character offset for a given byte offset.
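+
+For example (illustrative values): in the UTF-8 encoded string
+C<"Bäume"> the character C<ä> occupies two bytes, so
+
+ my $text = KorAP::Document::Primary->new('Bäume');
+ print $text->bytes2chars(3);
+
+would print C<2>, the character offset of the C<u>.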
+
+=cut
diff --git a/lib/KorAP/MultiTerm.pm b/lib/KorAP/MultiTerm.pm
new file mode 100644
index 0000000..9dd12a9
--- /dev/null
+++ b/lib/KorAP/MultiTerm.pm
@@ -0,0 +1,37 @@
+package KorAP::MultiTerm;
+use Mojo::Base -base;
+
+has [qw/p_start p_end o_start o_end term payload/];
+has store_offsets => 1;
+
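+# Serialize the term, roughly following the pattern
+# term[#o_start-o_end][$<i>p_end][payload]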
+sub to_string {
+ my $self = shift;
+ my $string = $self->term;
+ if (defined $self->o_start) {
+ $string .= '#' .$self->o_start .'-' . $self->o_end;
+# }
+# elsif (!$self->storeOffsets) {
+# $string .= '#-';
+ };
+
+ if ($self->p_end || $self->payload) {
+ $string .= '$';
+ if ($self->p_end) {
+ $string .= '<i>' . $self->p_end;
+ };
+ if ($self->payload) {
+ if (index($self->payload, '<') == 0) {
+ $string .= $self->payload;
+ }
+ else {
+ $string .= '<?>' . $self->payload;
+ };
+ };
+ };
+
+ return $string;
+};
+
+1;
diff --git a/lib/KorAP/MultiTermToken.pm b/lib/KorAP/MultiTermToken.pm
new file mode 100644
index 0000000..9df9811
--- /dev/null
+++ b/lib/KorAP/MultiTermToken.pm
@@ -0,0 +1,34 @@
+package KorAP::MultiTermToken;
+use KorAP::MultiTerm;
+use Mojo::Base -base;
+
+has [qw/o_start o_end/];
+
+sub add {
+ my $self = shift;
+ my $mt;
+ unless (ref $_[0] eq 'KorAP::MultiTerm') {
+ if (@_ == 1) {
+ $mt = KorAP::MultiTerm->new(term => shift());
+ }
+ else {
+ $mt = KorAP::MultiTerm->new(@_);
+ };
+ }
+ else {
+ $mt = shift;
+ };
+ $self->{mt} //= [];
+ push(@{$self->{mt}}, $mt);
+ return $mt;
+};
+
+sub to_string {
+ my $self = shift;
+ my $string = '[(' . $self->o_start . '-'. $self->o_end . ')';
+ $string .= join ('|', map($_->to_string, @{$self->{mt}}));
+ $string .= ']';
+ return $string;
+};
+
+1;
diff --git a/lib/KorAP/MultiTermTokenStream.pm b/lib/KorAP/MultiTermTokenStream.pm
new file mode 100644
index 0000000..6d7dc29
--- /dev/null
+++ b/lib/KorAP/MultiTermTokenStream.pm
@@ -0,0 +1,33 @@
+package KorAP::MultiTermTokenStream;
+use Mojo::Base -base;
+use KorAP::MultiTermToken;
+
+has [qw/o_start o_end/];
+
+sub add {
+ my $self = shift;
+ my $mtt = shift // KorAP::MultiTermToken->new;
+ $self->{mtt} //= [];
+ push(@{$self->{mtt}}, $mtt);
+ return $mtt;
+};
+
+sub add_meta {
+ my $self = shift;
+ my $mt = $self->pos(0)->add('-:' . shift);
+ $mt->payload(shift);
+ $mt->store_offsets(0);
+};
+
+sub pos {
+ my $self = shift;
+ my $pos = shift;
+ return $self->{mtt}->[$pos];
+};
+
+sub to_string {
+ my $self = shift;
+ return join("\n" , map { $_->to_string } @{$self->{mtt}}) . "\n";
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
new file mode 100644
index 0000000..7038cc4
--- /dev/null
+++ b/lib/KorAP/Tokenizer.pm
@@ -0,0 +1,362 @@
+package KorAP::Tokenizer;
+
+use Mojo::Base -base;
+use Mojo::ByteStream 'b';
+use Mojo::DOM;
+use Carp qw/carp croak/;
+use KorAP::Tokenizer::Range;
+use KorAP::Tokenizer::Match;
+use KorAP::Tokenizer::Spans;
+use KorAP::Tokenizer::Tokens;
+use KorAP::MultiTermTokenStream;
+use Log::Log4perl;
+
+has [qw/path foundry layer doc stream should have/];
+
+has 'log' => sub {
+ Log::Log4perl->get_logger(__PACKAGE__)
+};
+
+# Parse tokens of the document
+sub parse {
+ my $self = shift;
+
+ # Create new token stream
+ my $mtts = KorAP::MultiTermTokenStream->new;
+ my $file = b($self->path . $self->foundry . '/' . ($self->layer // 'tokens') . '.xml')->slurp;
+ my $tokens = Mojo::DOM->new($file);
+ $tokens->xml(1);
+
+ my $doc = $self->doc;
+
+ my ($should, $have) = (0, 0);
+
+ # Create range and match objects
+ my $range = KorAP::Tokenizer::Range->new;
+ my $match = KorAP::Tokenizer::Match->new;
+
+ my $old = 0;
+
+ $self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer);
+
+ # Iterate over all tokens
+ $tokens->find('span')->each(
+ sub {
+ my $span = $_;
+ my $from = $span->attr('from');
+ my $to = $span->attr('to');
+ my $token = $doc->primary->data($from, $to);
+
+ $should++;
+
+ # Ignore non-word tokens
+ return if $token !~ /[\w\d]/;
+
+ my $mtt = $mtts->add;
+
+ # Add gap for later finding matching positions before or after
+ $range->gap($old, $from, $have) unless $old >= $from;
+
+ # Add surface term
+ $mtt->add('s:' . $token);
+
+ # Add case insensitive term
+ $mtt->add('i:' . lc $token);
+
+ # Add offset information
+ $mtt->o_start($from);
+ $mtt->o_end($to);
+
+ # Store offset information for position matching
+ $range->set($from, $to, $have);
+ $match->set($from, $to, $have);
+
+ $old = $to + 1;
+
+ # Add position term
+ $mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);
+
+ $have++;
+ });
+
+ # Add token count
+ $mtts->add_meta('t', '<i>' . $have);
+
+ $range->gap($old, $doc->primary->data_length, $have-1) if $doc->primary->data_length >= $old;
+
+ # Add info
+ $self->stream($mtts);
+ $self->{range} = $range;
+ $self->{match} = $match;
+ $self->should($should);
+ $self->have($have);
+
+ $self->log->debug('With a non-word quota of ' . _perc($self->should, $self->should - $self->have) . ' %');
+};
+
+
+# Get span positions through character offsets
+sub range {
+ return shift->{range} // KorAP::Tokenizer::Range->new;
+};
+
+
+# Get token positions through character offsets
+sub match {
+ return shift->{match} // KorAP::Tokenizer::Match->new;
+};
+
+
+# Add information of spans to the tokens
+sub add_spandata {
+ my $self = shift;
+ my %param = @_;
+
+ croak 'No token data available' unless $self->stream;
+
+ $self->log->trace(
+ ($param{skip} ? 'Skip' : 'Add').' span data '.$param{foundry}.':'.$param{layer}
+ );
+
+ return if $param{skip};
+
+ my $cb = delete $param{cb};
+
+ if ($param{encoding} && $param{encoding} eq 'bytes') {
+ $param{primary} = $self->doc->primary;
+ };
+
+ my $spans = KorAP::Tokenizer::Spans->new(
+ path => $self->path,
+ range => $self->range,
+ %param
+ );
+
+ my $spanarray = $spans->parse;
+
+ if ($spans->should == $spans->have) {
+ $self->log->trace('With perfect alignment!');
+ }
+ else {
+ $self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %');
+ };
+
+
+ if ($cb) {
+ foreach (@$spanarray) {
+ $cb->($self->stream, $_);
+ };
+ return;
+ };
+ return $spans;
+};
+
+
+# Add information to the tokens
+sub add_tokendata {
+ my $self = shift;
+ my %param = @_;
+
+ croak 'No token data available' unless $self->stream;
+
+ $self->log->trace(
+ ($param{skip} ? 'Skip' : 'Add').' token data '.$param{foundry}.':'.$param{layer}
+ );
+ return if $param{skip};
+
+ my $cb = delete $param{cb};
+
+ if ($param{encoding} && $param{encoding} eq 'bytes') {
+ $param{primary} = $self->doc->primary;
+ };
+
+ my $tokens = KorAP::Tokenizer::Tokens->new(
+ path => $self->path,
+ match => $self->match,
+ %param
+ );
+
+ my $tokenarray = $tokens->parse;
+
+ if ($tokens->should == $tokens->have) {
+ $self->log->trace('With perfect alignment!');
+ }
+ else {
+ my $perc = _perc(
+ $tokens->should, $tokens->have, $self->should, $self->should - $self->have
+ );
+ $self->log->debug('With an alignment quota of ' . $perc);
+ };
+
+ if ($cb) {
+ foreach (@$tokenarray) {
+ $cb->($self->stream, $_);
+ };
+ return;
+ };
+ return $tokens;
+};
+
+
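+# Format percentages for logging:
+# with two arguments, return $_[1]/$_[0] as a percentage;
+# with four arguments, return the alignment quota and, in square
+# brackets, its sum with the non-word quota (if it does not exceed 100%).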
+sub _perc {
+ if (@_ == 2) {
+ # '[' . $_[0] . '/' . $_[1] . ']' .
+ return sprintf("%.2f", ($_[1] * 100) / $_[0]);
+ }
+
+ my $a_should = shift;
+ my $a_have = shift;
+ my $b_should = shift;
+ my $b_have = shift;
+ my $a_quota = ($a_have * 100) / $a_should;
+ my $b_quota = ($b_have * 100) / $b_should;
+ return sprintf("%.2f", $a_quota) . '%' .
+ ((($a_quota + $b_quota) <= 100) ?
+ ' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]' : '');
+};
+
+
+1;
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+KorAP::Tokenizer
+
+=head1 SYNOPSIS
+
+ my $tokens = KorAP::Tokenizer->new(
+ path => '../examples/00003',
+ doc => KorAP::Document->new( ... ),
+ foundry => 'opennlp',
+ layer => 'tokens'
+ );
+
+ $tokens->parse;
+
+=head1 DESCRIPTION
+
+Convert token information from the KorAP XML
+format into Lucene index-compatible token streams.
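+
+A single token of the serialized stream may, for example, look
+roughly like this (illustrative values only):
+
+ [(0-3)s:Der|i:der|_0#0-3]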
+
+=head1 ATTRIBUTES
+
+=head2 path
+
+ print $tokens->path;
+
+The path of the document.
+
+
+=head2 foundry
+
+ print $tokens->foundry;
+
+The name of the foundry.
+
+
+=head2 layer
+
+ print $tokens->layer;
+
+The name of the tokens layer.
+
+
+=head2 doc
+
+ print $tokens->doc->corpus_id;
+
+The L<KorAP::Document> object.
+
+
+=head2 stream
+
+ $tokens->stream->add_meta('adjCount', '<i>45');
+
+The L<KorAP::MultiTermTokenStream> object.
+
+
+=head2 range
+
+ $tokens->range->lookup(45);
+
+The L<KorAP::Tokenizer::Range> object for converting span offsets to positions.
+
+=head2 match
+
+ $tokens->match->lookup(45);
+
+The L<KorAP::Tokenizer::Match> object for converting token offsets to positions.
+
+
+=head1 METHODS
+
+=head2 parse
+
+ $tokens->parse;
+
+Start the tokenization process.
+
+
+=head2 add_spandata
+
+ $tokens->add_spandata(
+ foundry => 'base',
+ layer => 'sentences',
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:s',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ }
+ );
+
+Add span information to the parsed token stream.
+Expects a C<foundry> name, a C<layer> name and a
+callback parameter that will be called for each parsed
+span. The L<KorAP::MultiTermTokenStream> object will be passed,
+as well as the current L<KorAP::Tokenizer::Span>.
+
+An optional parameter C<encoding> may indicate whether the span offsets
+refer to C<bytes> or C<utf-8> offsets.
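+
+For byte offsets this may look like the following sketch (cf. the XIP
+constituency example in F<script/prepare_index.pl>):
+
+ $tokens->add_spandata(
+   foundry => 'xip',
+   layer => 'constituency',
+   encoding => 'bytes',
+   cb => sub { ... }
+ );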
+
+An optional parameter C<skip> allows for skipping the process.
+
+
+=head2 add_tokendata
+
+ $tokens->add_tokendata(
+ foundry => 'connexor',
+ layer => 'syntax',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ # syntax
+ if ((my $found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'cnx_syn:' . $found
+ );
+ };
+ });
+
+Add token information to the parsed token stream.
+Expects a C<foundry> name, a C<layer> name and a
+callback parameter that will be called for each parsed
+token. The L<KorAP::MultiTermTokenStream> object will be passed,
+as well as the current L<KorAP::Tokenizer::Token>.
+
+An optional parameter C<encoding> may indicate whether the token offsets
+refer to C<bytes> or C<utf-8> offsets.
+
+An optional parameter C<skip> allows for skipping the process.
+
+=cut
diff --git a/lib/KorAP/Tokenizer/Match.pm b/lib/KorAP/Tokenizer/Match.pm
new file mode 100644
index 0000000..e5a96f8
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Match.pm
@@ -0,0 +1,17 @@
+package KorAP::Tokenizer::Match;
+use strict;
+use warnings;
+
+sub new {
+ bless {}, shift;
+};
+
+sub set {
+ $_[0]->{$_[1] . ':' . $_[2]} = $_[3];
+};
+
+sub lookup {
+ $_[0]->{$_[1] . ':' . $_[2]} // undef;
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer/Range.pm b/lib/KorAP/Tokenizer/Range.pm
new file mode 100644
index 0000000..c18136b
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Range.pm
@@ -0,0 +1,55 @@
+package KorAP::Tokenizer::Range;
+use strict;
+use warnings;
+use Array::IntSpan;
+
+sub new {
+ my $class = shift;
+ my $range = Array::IntSpan->new;
+ bless \$range, $class;
+};
+
+sub set {
+ my $self = shift;
+ $$self->set_range(@_);
+};
+
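+# Mark a character range not covered by any token.
+# The stored value '!p:q' encodes the token positions immediately
+# preceding (p) and following (q) the gap (see before() and after()).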
+sub gap {
+ my $self = shift;
+ $$self->set_range($_[0], $_[1], '!' . ($_[2] - 1) . ':' . $_[2]);
+};
+
+sub lookup {
+ # Position 0 is valid, so check for definedness rather than truth
+ my $x = ${$_[0]}->lookup( $_[1] );
+ return unless defined $x;
+ return if index($x, '!') == 0;
+ return $x;
+};
+
+sub before {
+ my $self = shift;
+ my $offset = shift;
+ my $found = $$self->lookup( $offset );
+ unless (defined $found) {
+ warn 'There is no value for ', $offset;
+ return;
+ };
+
+ if ($found =~ /!(\d+):(\d+)$/) {
+ return $1 >= 0 ? $1 : 0;
+ }
+ else {
+ return $found;
+ };
+};
+
+sub after {
+ my $self = shift;
+ my $found = $$self->lookup( shift() );
+ return unless defined $found;
+ if ($found =~ /!(\d+):(\d+)$/) {
+ return $2;
+ }
+ else {
+ return $found;
+ };
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer/Span.pm b/lib/KorAP/Tokenizer/Span.pm
new file mode 100644
index 0000000..0010d7c
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Span.pm
@@ -0,0 +1,70 @@
+package KorAP::Tokenizer::Span;
+use strict;
+use warnings;
+use Mojo::DOM;
+
+sub new {
+ bless [], shift;
+};
+
+sub o_start {
+ if (defined $_[1]) {
+ $_[0]->[0] = $_[1];
+ };
+ $_[0]->[0];
+};
+
+sub o_end {
+ if (defined $_[1]) {
+ $_[0]->[1] = $_[1];
+ };
+ $_[0]->[1];
+};
+
+sub p_start {
+ if (defined $_[1]) {
+ $_[0]->[2] = $_[1];
+ };
+ $_[0]->[2];
+};
+
+sub p_end {
+ if (defined $_[1]) {
+ $_[0]->[3] = $_[1];
+ };
+ $_[0]->[3];
+};
+
+sub id {
+ if (defined $_[1]) {
+ $_[0]->[4] = $_[1];
+ };
+ $_[0]->[4];
+};
+
+
+sub content {
+ if (defined $_[1]) {
+ $_[0]->[5] = $_[1];
+ }
+ else {
+ if ($_[0]->processed) {
+ return $_[0]->[5];
+ }
+ else {
+ my $c = Mojo::DOM->new($_[0]->[5]);
+ $c->xml(1);
+ $_[0]->processed(1);
+ return $_[0]->[5] = $c;
+ };
+ };
+};
+
+sub processed {
+ if (defined $_[1]) {
+ $_[0]->[6] = $_[1] ? 1 : 0;
+ };
+ $_[0]->[6];
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer/Spans.pm b/lib/KorAP/Tokenizer/Spans.pm
new file mode 100644
index 0000000..7e1a382
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Spans.pm
@@ -0,0 +1,63 @@
+package KorAP::Tokenizer::Spans;
+use Mojo::Base -base;
+use KorAP::Tokenizer::Span;
+use Mojo::DOM;
+use Mojo::ByteStream 'b';
+
+has [qw/path foundry layer range primary should have/];
+has 'encoding' => 'utf-8';
+
+sub parse {
+ my $self = shift;
+ my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
+
+ my $spans = Mojo::DOM->new($file);
+ $spans->xml(1);
+
+ my ($should, $have) = (0,0);
+ my ($from, $to);
+
+ my @spans;
+ $spans->find('span')->each(
+ sub {
+ my $s = shift;
+
+ $should++;
+
+ if ($self->encoding eq 'bytes') {
+ $from = $self->primary->bytes2chars($s->attr('from'));
+ $to = $self->primary->bytes2chars($s->attr('to'));
+ }
+ else {
+ $from = $s->attr('from');
+ $to = $s->attr('to');
+ };
+
+ return unless $to > $from;
+
+ my $span = KorAP::Tokenizer::Span->new;
+
+ $span->id($s->attr('id'));
+ $span->o_start($from);
+ $span->o_end($to);
+ $span->p_start($self->range->after($span->o_start));
+ $span->p_end($self->range->before($span->o_end));
+
+ return unless $span->p_end >= $span->p_start;
+
+ if (@{$s->children}) {
+ $span->content($s->content_xml);
+ };
+
+ $have++;
+
+ push(@spans, $span);
+ });
+
+ $self->should($should);
+ $self->have($have);
+
+ return \@spans;
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer/Token.pm b/lib/KorAP/Tokenizer/Token.pm
new file mode 100644
index 0000000..f6c1971
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Token.pm
@@ -0,0 +1,37 @@
+package KorAP::Tokenizer::Token;
+use strict;
+use warnings;
+use Mojo::DOM;
+
+sub new {
+ bless [], shift;
+};
+
+sub pos {
+ if (defined $_[1]) {
+ $_[0]->[0] = $_[1];
+ };
+ $_[0]->[0];
+};
+
+sub content {
+ if ($_[1]) {
+ $_[0]->[1] = $_[1];
+ }
+ else {
+ my $c = Mojo::DOM->new($_[0]->[1]);
+ $c->xml(1);
+ return $c;
+ };
+};
+
+sub id {
+ if ($_[1]) {
+ $_[0]->[2] = $_[1];
+ }
+ else {
+ $_[0]->[2];
+ };
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer/Tokens.pm b/lib/KorAP/Tokenizer/Tokens.pm
new file mode 100644
index 0000000..7657460
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Tokens.pm
@@ -0,0 +1,62 @@
+package KorAP::Tokenizer::Tokens;
+use Mojo::Base -base;
+use Mojo::DOM;
+use Mojo::ByteStream 'b';
+use KorAP::Tokenizer::Token;
+
+has [qw/path foundry layer match primary should have/];
+has 'encoding' => 'utf-8';
+
+sub parse {
+ my $self = shift;
+ my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
+
+ my $spans = Mojo::DOM->new($file);
+ $spans->xml(1);
+
+ my ($should, $have) = (0,0);
+ my ($from, $to);
+
+ my $match = $self->match;
+
+ my @tokens;
+ $spans->find('span')->each(
+ sub {
+ my $s = shift;
+
+ $should++;
+
+ if ($self->encoding eq 'bytes') {
+ $from = $self->primary->bytes2chars($s->attr('from'));
+ $to = $self->primary->bytes2chars($s->attr('to'));
+ }
+ else {
+ $from = $s->attr('from');
+ $to = $s->attr('to');
+ };
+
+ my $pos = $match->lookup($from, $to);
+
+ return unless defined $pos;
+
+ my $token = KorAP::Tokenizer::Token->new;
+ $token->id($s->attr('id'));
+ $token->pos($pos);
+
+ if (@{$s->children}) {
+ $token->content($s->content_xml);
+ };
+
+ $have++;
+
+ push(@tokens, $token);
+ });
+
+ $self->should($should);
+ $self->have($have);
+
+ return \@tokens;
+};
+
+
+1;
diff --git a/script/log4perl.conf b/script/log4perl.conf
new file mode 100644
index 0000000..90459dd
--- /dev/null
+++ b/script/log4perl.conf
@@ -0,0 +1,5 @@
+log4perl.rootLogger=TRACE, STDERR
+
+log4perl.appender.STDERR = Log::Log4perl::Appender::ScreenColoredLevels
+log4perl.appender.STDERR.layout=PatternLayout
+log4perl.appender.STDERR.layout.ConversionPattern=[%r] %F %L %c - %m%n
\ No newline at end of file
diff --git a/script/prepare_index.pl b/script/prepare_index.pl
new file mode 100644
index 0000000..f0dc398
--- /dev/null
+++ b/script/prepare_index.pl
@@ -0,0 +1,405 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use v5.16;
+use lib 'lib', '../lib';
+use Set::Scalar;
+use Mojo::DOM;
+use Mojo::Util qw/encode decode/;
+use Mojo::ByteStream 'b';
+
+use Log::Log4perl;
+Log::Log4perl->init("script/log4perl.conf");
+
+use KorAP::Document;
+use KorAP::Tokenizer;
+
+
+# Call perl script/prepare_index.pl WPD/AAA/00001
+
+sub parse_doc {
+ my $doc = KorAP::Document->new(
+ path => shift . '/'
+ );
+
+ $doc->parse;
+
+ my $tokens = KorAP::Tokenizer->new(
+ path => $doc->path,
+ doc => $doc,
+ foundry => 'connexor',
+ layer => 'tokens'
+ );
+
+ $tokens->parse;
+
+ my $i = 0;
+ $tokens->add_spandata(
+ foundry => 'connexor',
+ layer => 'sentences',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:s',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ $i++;
+ }
+ );
+
+ $tokens->stream->add_meta('s', '<i>' . $i);
+
+ $i = 0;
+ $tokens->add_spandata(
+ foundry => 'base',
+ layer => 'paragraph',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:p',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ $i++;
+ }
+ );
+ $tokens->stream->add_meta('p', '<i>' . $i);
+
+ $tokens->add_tokendata(
+ foundry => 'opennlp',
+ layer => 'morpho',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # syntax
+ if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'opennlp_p:' . $found
+ );
+ };
+ });
+
+
+ my $model = 'ne_dewac_175m_600';
+ $tokens->add_tokendata(
+ foundry => 'corenlp',
+ #skip => 1,
+ layer => $model,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'corenlp_' . $model . ':' . $found
+ );
+ };
+ });
+
+ $model = 'ne_hgc_175m_600';
+ $tokens->add_tokendata(
+ foundry => 'corenlp',
+ #skip => 1,
+ layer => $model,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'corenlp_' . $model . ':' . $found
+ );
+ };
+ });
+
+ $tokens->add_tokendata(
+ foundry => 'connexor',
+ layer => 'morpho',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # Lemma
+ if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
+ if (index($found, "\N{U+00a0}") >= 0) {
+ $found = b($found)->decode;
+ foreach (split(/\x{00A0}/, $found)) {
+ $mtt->add(
+ term => 'cnx_l:' . b($_)->encode
+ );
+ }
+ }
+ else {
+ $mtt->add(
+ term => 'cnx_l:' . $found # b($found)->encode
+ );
+ };
+ };
+
+ # POS
+ if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'cnx_p:' . $found
+ );
+ };
+
+ # MSD
+ # Todo: Look in the description!
+ if (($found = $content->at('f[name="msd"]')) && ($found = $found->text)) {
+ foreach (split(':', $found)) {
+ $mtt->add(
+ term => 'cnx_m:' . $_
+ );
+ };
+ };
+ }
+ );
+
+ $tokens->add_tokendata(
+ foundry => 'connexor',
+ layer => 'syntax',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # syntax
+ if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'cnx_syn:' . $found
+ );
+ };
+ });
+
+ $tokens->add_spandata(
+ foundry => 'connexor',
+ layer => 'phrase',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $span) = @_;
+
+ my $type = $span->content->at('f[name=pos]');
+ if ($type && ($type = $type->text)) {
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:cnx_const:' . $type,
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ };
+ }
+ );
+
+ $tokens->add_tokendata(
+ foundry => 'tree_tagger',
+ #skip => 1,
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # lemma
+ if (($found = $content->at('f[name="lemma"]')) &&
+ ($found = $found->text) && $found ne 'UNKNOWN') {
+ $mtt->add(
+ term => 'tt_l:' . $found
+ );
+ };
+
+ # pos
+ if (($found = $content->at('f[name="ctag"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'tt_p:' . $found
+ );
+ };
+ });
+
+ $tokens->add_tokendata(
+ foundry => 'mate',
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # pos
+ if (($found = $content->at('f[name="pos"]')) &&
+ ($found = $found->text)) {
+ $mtt->add(term => 'mate_p:' . $found
+ );
+ };
+
+ # lemma
+ if (($found = $content->at('f[name="lemma"]'))
+ && ($found = $found->text)
+ && $found ne '--') {
+ $mtt->add(term => 'mate_l:' . b($found)->decode('latin-1')->encode->to_string);
+ };
+
+ # MSD
+ if (($found = $content->at('f[name="msd"]')) &&
+ ($found = $found->text) &&
+ ($found ne '_')) {
+ foreach (split '\|', $found) {
+ my ($x, $y) = split "=", $_;
+ # case, tense, number, mood, person, degree, gender
+ $mtt->add(term => 'mate_m:' . $x . ':' . $y);
+ };
+ };
+ });
+
+
+ $tokens->add_tokendata(
+ foundry => 'xip',
+ #skip => 1,
+ layer => 'morpho',
+ encoding => 'bytes',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ my $capital = 0;
+ # pos
+ if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'xip_p:' . $found
+ );
+
+ $capital = 1 if $found eq 'NOUN';
+ };
+
+ # lemma
+ if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
+ my (@token) = split('#', $found);
+
+ my $full = '';
+ foreach (@token) {
+ $full .= $_;
+ $_ =~ s{/\w+$}{};
+ $mtt->add(term => 'xip_l:' . $_);
+ };
+ if (@token > 1) {
+ $full =~ s{/}{}g;
+ $full = lc $full;
+ $full = $capital ? ucfirst($full) : $full;
+ $mtt->add(term => 'xip_l:' . $full);
+ };
+ };
+ });
+
+
+ # Collect all spans and check for roots
+ my %xip_const;
+ my $xip_const_root = Set::Scalar->new;
+ my $xip_const_noroot = Set::Scalar->new;
+
+ # First run:
+ $tokens->add_spandata(
+ foundry => 'xip',
+ layer => 'constituency',
+ encoding => 'bytes',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $span) = @_;
+
+ $xip_const{$span->id} = $span;
+ $xip_const_root->insert($span->id);
+
+ $span->content->find('rel[label=dominates][target]')->each(
+ sub {
+ my $rel = shift;
+ $xip_const_noroot->insert($rel->attr('target'));
+ }
+ );
+ }
+ );
+
+ my $stream = $tokens->stream;
+
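+ # Add a constituency span to the stream and recursively descend
+ # into all spans it dominates, increasing the nesting level.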
+ my $add_const = sub {
+ my $span = shift;
+ my $level = shift;
+ my $mtt = $stream->pos($span->p_start);
+
+ my $content = $span->content;
+ my $type = $content->at('f[name=const]');
+ if ($type && ($type = $type->text)) {
+ # $type is now NPA, NP, NUM
+ my %term = (
+ term => '<>:xip_const:' . $type,
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+
+ $term{payload} = '<s>' . $level if $level;
+
+ $mtt->add(%term);
+
+ my $this = __SUB__;
+
+ $content->find('rel[label=dominates][target]')->each(
+ sub {
+ my $subspan = delete $xip_const{$_[0]->attr('target')} or return;
+ $this->($subspan, $level + 1);
+ }
+ );
+ };
+ };
+
+ my $diff = $xip_const_root->difference($xip_const_noroot);
+ foreach ($diff->members) {
+ my $obj = delete $xip_const{$_} or next;
+ $add_const->($obj, 0);
+ };
+
+ # Todo: Add mate-morpho
+ # Todo: Add mate-dependency
+ # Todo: Add xip-dependency
+
+ print $tokens->stream->to_string;
+};
+
+if ($ARGV[0]) {
+ parse_doc($ARGV[0]);
+};
+
+
+
+__END__