Indexation script finished

commit: 7364d1f29b898a41bb2074a34ce671f7c36ca5de [log] [tgz]
author: Nils Diewald <nils@diewald-online.de> Tue Nov 05 19:26:35 2013 +0000
committer: Nils Diewald <nils@diewald-online.de> Tue Nov 05 19:26:35 2013 +0000
tree: 94575601cacd010be2d075a2b8a6e1926fdb60f1
parent: 2db9ad0d484214d641313937810225a635a997a4 [diff]
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 8f80e6c..4a07f8e 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm

@@ -4,11 +4,15 @@
 
 use Mojo::ByteStream 'b';
 use Mojo::DOM;
-use Carp qw/croak carp/;
+use Carp qw/croak/;
 use KorAP::Document::Primary;
 
-has [qw/id corpus_id path/];
-has [qw/pub_date title sub_title pub_place/];
+our @ATTR = qw/id corpus_id pub_date
+	       title sub_title pub_place/;
+has 'path';
+has [@ATTR];
+
+has log => sub { Log::Log4perl->get_logger(__PACKAGE__) };
 
 # parse document
 sub parse {
@@ -17,7 +21,7 @@
 
   state $unable = 'Unable to parse document';
 
-  carp 'Parse document ' . $self->path;
+  $self->log->trace('Parse document ' . $self->path);
 
   my $dom = Mojo::DOM->new($file);
 
@@ -84,7 +88,7 @@
 sub _parse_meta {
   my $self = shift;
 
-  my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1')->encode;
+  my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');
 
   my $dom = Mojo::DOM->new($file);
   my $monogr = $dom->at('monogr');
@@ -126,6 +130,55 @@
   $self->text_class(@topic);
 };
 
+# Serialize the metadata of the document as "key = value" lines.
+#
+# Emits one line for every set attribute listed in @ATTR, one
+# "author = ..." line per author and one "text_class = ..." line per
+# text class. Newlines and whitespace runs in attribute and author
+# values are collapsed to a single space.
+# Returns the serialization, which is the empty string in case
+# no metadata is set.
+sub to_string {
+  my $self = shift;
+
+  # Always defined, so callers can safely concatenate the result
+  my $string = '';
+
+  foreach my $name (@ATTR) {
+    my $att = $self->$name or next;
+    $att =~ s/\n/ /g;
+    $att =~ s/\s\s+/ /g;
+    $string .= $name . ' = ' . $att . "\n";
+  };
+
+  if (my $authors = $self->author) {
+    foreach my $author (@$authors) {
+
+      # Normalize a copy - the stored author list stays untouched
+      (my $name = $author) =~ s/\n/ /g;
+      $name =~ s/\s\s+/ /g;
+      $string .= 'author = ' . $name . "\n";
+    };
+  };
+
+  if (my $classes = $self->text_class) {
+    foreach my $class (@$classes) {
+      $string .= 'text_class = ' . $class . "\n";
+    };
+  };
+
+  return $string;
+};
+
+
+# Collect the metadata of the document in a hash reference.
+#
+# Contains an entry for every set attribute listed in @ATTR as well as
+# for 'author' and 'text_class'. Newlines and whitespace runs are
+# collapsed to a single space; values given as array references are
+# normalized element by element on a copy.
+# Returns a hash reference.
+sub to_hash {
+  my $self = shift;
+
+  # Collapse newlines and whitespace runs of a single value
+  my $clean = sub {
+    my $value = shift;
+    $value =~ s/\n/ /g;
+    $value =~ s/\s\s+/ /g;
+    return $value;
+  };
+
+  my %hash;
+
+  foreach my $name (@ATTR, 'author', 'text_class') {
+    my $att = $self->$name or next;
+
+    # Applying the substitution to the reference itself would
+    # leave the list elements unnormalized
+    if (ref $att eq 'ARRAY') {
+      $hash{$name} = [ map { $clean->($_) } @$att ];
+    }
+    else {
+      $hash{$name} = $clean->($att);
+    };
+  };
+
+  return \%hash;
+};
+
+
+
 
 1;
 

diff --git a/lib/KorAP/Document/Primary.pm b/lib/KorAP/Document/Primary.pm
index 52ca844..be0a234 100644
--- a/lib/KorAP/Document/Primary.pm
+++ b/lib/KorAP/Document/Primary.pm

@@ -5,7 +5,10 @@
 use Mojo::ByteStream 'b';
 use feature 'state';
 use Packed::Array;
+use utf8;
 
+# our $QUOT = b("„“”")->decode;
+our $QUOT_RE = qr/[„“”]/;
 
 # Constructor
 sub new {
@@ -19,13 +22,37 @@
   my $self = shift;
   my ($from, $to) = @_;
 
-  return b(substr($self->[0], $from))->encode if $from && !$to;
+#  return b(substr($self->[0], $from))->encode if $from && !$to;
+  return substr($self->[0], $from) if $from && !$to;
 
-  return b($self->[0])->encode unless $to;
+#  return b($self->[0])->encode unless $to;
+  return $self->[0] unless $to;
 
   my $substr = substr($self->[0], $from, $to - $from);
   if ($substr) {
-    return b($substr)->encode;
+#    return b($substr)->encode;
+    return $substr;
+  };
+  # encode 'UTF-8',
+  croak 'Unable to find substring';
+};
+
+
+sub data_bytes {
+  my $self = shift;
+  my ($from, $to) = @_;
+
+  use bytes;
+
+  return b(substr($self->[0], $from))->decode if $from && !$to;
+
+#  return b($self->[0])->encode unless $to;
+  return b($self->[0])->decode unless $to;
+
+  my $substr = substr($self->[0], $from, $to - $from);
+  if ($substr) {
+#    return b($substr)->encode;
+    return b($substr)->decode;
   };
   # encode 'UTF-8',
   croak 'Unable to find substring';
@@ -45,17 +72,31 @@
 sub bytes2chars {
   my $self = shift;
   unless ($self->[2]) {
-    $self->_calc_chars;
+    $self->[2] = $self->_calc_chars($self->[0]);
   };
   return $self->[2]->[shift];
 };
 
+# Translate an offset using a lookup table that is calculated on the
+# primary text with fancy quotation marks replaced by plain ones
+sub xip2chars {
+  my ($self, $offset) = @_;
+
+  # The table is calculated once and cached in slot 3
+  if (!$self->[3]) {
+
+    # Hacky work around: replace fancy quotation marks for XIP
+    (my $text = $self->[0]) =~ s{$QUOT_RE}{"}g;
+
+    $self->[3] = $self->_calc_chars($text);
+  };
+
+  return $self->[3]->[$offset];
+};
 
 # Calculate character offsets
 sub _calc_chars {
   use bytes;
-
   my $self = shift;
+  my $text = shift;
+
   tie my @array, 'Packed::Array';
 
   state $leading = pack( 'B8', '10000000' );
@@ -65,14 +106,14 @@
   my $c;
 
   # Init array
-  my $l = length($self->[0]);
+  my $l = length($text);
   $array[$l-1] = 0;
 
   # Iterate over every character
-  while ($i < $l) {
+  while ($i <= $l) {
 
     # Get actual character
-    $c = substr($self->[0], $i, 1);
+    $c = substr($text, $i, 1);
 
     # store character position
     $array[$i++] = $j;
@@ -81,7 +122,7 @@
     if (ord($c & $leading) && ord($c & $start)) {
 
       # Get the next byte - expecting a following character
-      $c = substr($self->[0], $i, 1);
+      $c = substr($text, $i, 1);
 
       # Character is part of a multibyte
       while (ord($c & $leading)) {
@@ -90,14 +131,13 @@
 	$array[$i] = (ord($c & $start)) ? ++$j : $j;
 
 	# Get next character
-	$c = substr($self->[0], ++$i, 1);
+	$c = substr($text, ++$i, 1);
       };
     };
 
     $j++;
   };
-
-  $self->[2] = \@array;
+  return \@array;
 };
 
 

diff --git a/lib/KorAP/MultiTerm.pm b/lib/KorAP/Field/MultiTerm.pm
similarity index 95%
rename from lib/KorAP/MultiTerm.pm
rename to lib/KorAP/Field/MultiTerm.pm
index 9dd12a9..7381a0d 100644
--- a/lib/KorAP/MultiTerm.pm
+++ b/lib/KorAP/Field/MultiTerm.pm

@@ -1,4 +1,4 @@
-package KorAP::MultiTerm;
+package KorAP::Field::MultiTerm;
 use Mojo::Base -base;
 
 has [qw/p_start p_end o_start o_end term payload/];

diff --git a/lib/KorAP/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
similarity index 65%
rename from lib/KorAP/MultiTermToken.pm
rename to lib/KorAP/Field/MultiTermToken.pm
index 9df9811..7675737 100644
--- a/lib/KorAP/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm

@@ -1,5 +1,5 @@
-package KorAP::MultiTermToken;
-use KorAP::MultiTerm;
+package KorAP::Field::MultiTermToken;
+use KorAP::Field::MultiTerm;
 use Mojo::Base -base;
 
 has [qw/o_start o_end/];
@@ -9,10 +9,10 @@
   my $mt;
   unless (ref $_[0] eq 'MultiTerm') {
     if (@_ == 1) {
-      $mt = KorAP::MultiTerm->new(term => shift());
+      $mt = KorAP::Field::MultiTerm->new(term => shift());
     }
     else {
-      $mt = KorAP::MultiTerm->new(@_);
+      $mt = KorAP::Field::MultiTerm->new(@_);
     };
   }
   else {
@@ -31,4 +31,9 @@
   return $string;
 };
 
+# Return the string representations of all terms of this token
+# as an array reference
+sub to_array {
+  my $self = shift;
+  my @terms = map { $_->to_string } @{$self->{mt}};
+  return \@terms;
+};
+
 1;

diff --git a/lib/KorAP/MultiTermTokenStream.pm b/lib/KorAP/Field/MultiTermTokenStream.pm
similarity index 68%
rename from lib/KorAP/MultiTermTokenStream.pm
rename to lib/KorAP/Field/MultiTermTokenStream.pm
index 6d7dc29..d2e66a8 100644
--- a/lib/KorAP/MultiTermTokenStream.pm
+++ b/lib/KorAP/Field/MultiTermTokenStream.pm

@@ -1,12 +1,12 @@
-package KorAP::MultiTermTokenStream;
+package KorAP::Field::MultiTermTokenStream;
 use Mojo::Base -base;
-use KorAP::MultiTermToken;
+use KorAP::Field::MultiTermToken;
 
 has [qw/oStart oEnd/];
 
 sub add {
   my $self = shift;
-  my $mtt = shift // KorAP::MultiTermToken->new;
+  my $mtt = shift // KorAP::Field::MultiTermToken->new;
   $self->{mtt} //= [];
   push(@{$self->{mtt}}, $mtt);
   return $mtt;
@@ -30,4 +30,9 @@
   return join("\n" , map { $_->to_string } @{$self->{mtt}}) . "\n";
 };
 
+# Return the array representations of all tokens of this stream
+# as an array reference
+sub to_array {
+  my $self = shift;
+  my @tokens = map($_->to_array, @{$self->{mtt}});
+  return \@tokens;
+};
+
 1;

diff --git a/lib/KorAP/Index/Base.pm b/lib/KorAP/Index/Base.pm
new file mode 100644
index 0000000..cc53420
--- /dev/null
+++ b/lib/KorAP/Index/Base.pm

@@ -0,0 +1,27 @@
+package KorAP::Index::Base;
+
+use strict;
+use warnings;
+
+# Called on "use KorAP::Index::Base": registers this class as a base
+# class of the calling package and enables strict, warnings, utf8 and
+# the 5.10 feature bundle for the calling scope.
+sub import {
+  my $class = shift;
+  my $caller = caller;
+
+  {
+    no strict 'refs';
+    push @{"${caller}::ISA"}, $class;
+  };
+
+  # A call to import() on a package that was never loaded is skipped
+  # silently, so make sure both pragmas are loaded before importing
+  require utf8;
+  require feature;
+
+  strict->import;
+  warnings->import;
+  utf8->import;
+  feature->import(':5.10');
+};
+
+
+# Constructor: the object is a blessed reference to a scalar
+# holding the passed value
+sub new {
+  my ($class, $tokens) = @_;
+  return bless \$tokens, $class;
+};
+
+1;

diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
new file mode 100644
index 0000000..35199d4
--- /dev/null
+++ b/lib/KorAP/Index/Base/Paragraphs.pm

@@ -0,0 +1,30 @@
+package KorAP::Index::Base::Paragraphs;
+use KorAP::Index::Base;
+
+
+
+# Add a '<>:p' term for every span of the base#paragraph layer and
+# store the number of added spans as meta information 'p'.
+# Returns 1 on success, nothing if the span data could not be added.
+sub parse {
+  my $self = shift;
+  my $count = 0;
+
+  # Invoked for every span of the layer
+  my $add_paragraph = sub {
+    my ($stream, $span) = @_;
+    $stream->pos($span->p_start)->add(
+      term    => '<>:p',
+      o_start => $span->o_start,
+      o_end   => $span->o_end,
+      p_end   => $span->p_end
+    );
+    $count++;
+  };
+
+  my $added = $$self->add_spandata(
+    foundry => 'base',
+    layer   => 'paragraph',
+    cb      => $add_paragraph
+  );
+  return unless $added;
+
+  $$self->stream->add_meta('p', '<i>' . $count);
+
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/Connexor/Morpho.pm b/lib/KorAP/Index/Connexor/Morpho.pm
new file mode 100644
index 0000000..74533bf
--- /dev/null
+++ b/lib/KorAP/Index/Connexor/Morpho.pm

@@ -0,0 +1,62 @@
+package KorAP::Index::Connexor::Morpho;
+use KorAP::Index::Base;
+
+# Add the terms of the connexor#morpho layer to the token stream:
+# 'cnx_l:' for lemmas, 'cnx_p:' for the part of speech and 'cnx_m:'
+# for every part of the morphosyntactic description.
+# Returns 1 on success, nothing if the token data could not be added.
+sub parse {
+  my $self = shift;
+
+  # Term prefix and - if the value holds several entries -
+  # the separator of the entries per feature name
+  my %rule_for = (
+    lemma => ['cnx_l:', qr/\x{00A0}/],
+    pos   => ['cnx_p:'],
+    # Todo: Look in the description!
+    msd   => ['cnx_m:', qr/:/]
+  );
+
+  my $added = $$self->add_tokendata(
+    foundry => 'connexor',
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $features = $token->hash->{fs}->{f}->{fs}->{f};
+
+      for my $f (@$features) {
+        my $rule = $rule_for{$f->{-name}} or next;
+        my $text = $f->{'#text'} or next;
+
+        my ($prefix, $separator) = @$rule;
+
+        # A value without separator is returned as a whole by split
+        my @values = $separator ? split($separator, $text) : ($text);
+
+        $mtt->add(term => $prefix . $_) foreach @values;
+      };
+    }
+  );
+  return unless $added;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/Connexor/Phrase.pm b/lib/KorAP/Index/Connexor/Phrase.pm
new file mode 100644
index 0000000..60fcc8e
--- /dev/null
+++ b/lib/KorAP/Index/Connexor/Phrase.pm

@@ -0,0 +1,35 @@
+package KorAP::Index::Connexor::Phrase;
+use KorAP::Index::Base;
+
+# Add a '<>:cnx_const:' term for every span of the connexor#phrase
+# layer that carries a 'pos' feature with a value.
+# Returns 1 on success, nothing if the span data could not be added.
+sub parse {
+  my $self = shift;
+
+  # Invoked for every span of the layer
+  my $add_phrase = sub {
+    my ($stream, $span) = @_;
+
+    my $feature = $span->hash->{fs}->{f};
+
+    return unless $feature->{-name} eq 'pos';
+
+    my $type = $feature->{'#text'} or return;
+
+    $stream->pos($span->p_start)->add(
+      term    => '<>:cnx_const:' . $type,
+      o_start => $span->o_start,
+      o_end   => $span->o_end,
+      p_end   => $span->p_end
+    );
+  };
+
+  my $added = $$self->add_spandata(
+    foundry => 'connexor',
+    layer   => 'phrase',
+    cb      => $add_phrase
+  );
+  return unless $added;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/Connexor/Syntax.pm b/lib/KorAP/Index/Connexor/Syntax.pm
new file mode 100644
index 0000000..35f5079
--- /dev/null
+++ b/lib/KorAP/Index/Connexor/Syntax.pm

@@ -0,0 +1,31 @@
+package KorAP::Index::Connexor::Syntax;
+use KorAP::Index::Base;
+
+# Add a 'cnx_syn:' term for every 'pos' feature with a value found in
+# the connexor#syntax layer of a token.
+# Returns 1 on success, nothing if the token data could not be added.
+sub parse {
+  my $self = shift;
+
+  # Invoked for every token of the layer
+  my $add_syntax = sub {
+    my ($stream, $token) = @_;
+    my $mtt = $stream->pos($token->pos);
+    my $features = $token->hash->{fs}->{f}->{fs}->{f};
+
+    for my $f (@$features) {
+      next unless $f->{-name} eq 'pos';
+      my $tag = $f->{'#text'} or next;
+      $mtt->add(term => 'cnx_syn:' . $tag);
+    };
+  };
+
+  my $added = $$self->add_tokendata(
+    foundry => 'connexor',
+    layer   => 'syntax',
+    cb      => $add_syntax
+  );
+  return unless $added;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/CoreNLP/NamedEntities.pm b/lib/KorAP/Index/CoreNLP/NamedEntities.pm
new file mode 100644
index 0000000..8596783
--- /dev/null
+++ b/lib/KorAP/Index/CoreNLP/NamedEntities.pm

@@ -0,0 +1,32 @@
+package KorAP::Index::CoreNLP::NamedEntities;
+use KorAP::Index::Base;
+
+# Add a 'corenlp_<model>:' term for every token of the given corenlp
+# layer that carries an 'ne' feature with a nested 'ent' value.
+# Expects the name of the model, which is used as the layer name.
+# Returns 1 on success, nothing if the token data could not be added.
+sub parse {
+  my ($self, $model) = @_;
+
+  # Invoked for every token of the layer
+  my $add_entity = sub {
+    my ($stream, $token) = @_;
+    my $mtt = $stream->pos($token->pos);
+
+    my $content = $token->hash->{fs}->{f} or return;
+    return unless $content->{-name} eq 'ne';
+
+    # Descend to the nested entity feature
+    my $entity = $content->{fs} or return;
+    $entity = $entity->{f} or return;
+    return unless $entity->{-name} eq 'ent';
+
+    my $label = $entity->{'#text'} or return;
+
+    $mtt->add(term => 'corenlp_' . $model . ':' . $label);
+  };
+
+  my $added = $$self->add_tokendata(
+    foundry => 'corenlp',
+    layer   => $model,
+    cb      => $add_entity
+  );
+  return unless $added;
+
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/Mate/Dependency.pm b/lib/KorAP/Index/Mate/Dependency.pm
new file mode 100644
index 0000000..d300363
--- /dev/null
+++ b/lib/KorAP/Index/Mate/Dependency.pm

@@ -0,0 +1,56 @@
+package KorAP::Index::Mate::Dependency;
+use KorAP::Index::Base;
+use Data::Dumper;
+
+# Add the relations of the mate#dependency layer to the token stream.
+#
+# Unary relations are added as 'mate_d:' terms (the label '--' is
+# ignored). All other relations are added twice: as a '>:mate_d:' term
+# at the current token and as a '<:mate_d:' term at the related token,
+# each with the position of the other token as payload.
+# Returns 1 on success, nothing if the token data could not be added.
+sub parse {
+  my $self = shift;
+
+  # TODO: Create XIP tree here - for indirect dependency
+  # >>:xip_d:SUBJ<i>566<i>789
+
+  $$self->add_tokendata(
+    foundry => 'mate',
+    layer => 'dependency',
+    cb => sub {
+      my ($stream, $token, $tokens) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      # A token without relations has nothing to index - without this
+      # guard an undefined relation would be wrapped and processed
+      my $rel = $token->hash->{rel} or return;
+      $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+      foreach my $relation (@$rel) {
+        my $label = $relation->{-label};
+
+        if ($relation->{-type} && $relation->{-type} eq 'unary') {
+          next if $label eq '--';
+          $mtt->add(
+            term => 'mate_d:' . $label
+          );
+          next;
+        };
+
+        my $from = $relation->{span}->{-from};
+        my $to   = $relation->{span}->{-to};
+
+        # Relations to unknown tokens are ignored
+        my $rel_token = $tokens->token($from, $to) or next;
+
+        $mtt->add(
+          term => '>:mate_d:' . $label,
+          payload => '<i>' . $rel_token->pos
+        );
+
+        $stream->pos($rel_token->pos)->add(
+          term => '<:mate_d:' . $label,
+          payload => '<i>' . $token->pos
+        );
+      };
+    }) or return;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/Mate/Morpho.pm b/lib/KorAP/Index/Mate/Morpho.pm
new file mode 100644
index 0000000..b1a6803
--- /dev/null
+++ b/lib/KorAP/Index/Mate/Morpho.pm

@@ -0,0 +1,54 @@
+package KorAP::Index::Mate::Morpho;
+use KorAP::Index::Base;
+
+# Add the terms of the mate#morpho layer to the token stream:
+# 'mate_p:' for the part of speech, 'mate_l:' for lemmas (except '--')
+# and 'mate_m:<key>:<value>' for every entry of the morphosyntactic
+# description (except '_').
+# Returns 1 on success, nothing if the token data could not be added.
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'mate',
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash->{fs}->{f};
+
+      my $found;
+
+      foreach my $f (@{$content->{fs}->{f}}) {
+
+        # pos
+        if (($f->{-name} eq 'pos') &&
+              ($found = $f->{'#text'})) {
+          $mtt->add(term => 'mate_p:' . $found);
+        }
+
+        # lemma
+        elsif (($f->{-name} eq 'lemma')
+                 && ($found = $f->{'#text'})
+                   && $found ne '--') {
+          $mtt->add(term => 'mate_l:' . $found);
+        }
+
+        # MSD
+        elsif (($f->{-name} eq 'msd') &&
+                 ($found = $f->{'#text'}) &&
+                   ($found ne '_')) {
+          foreach (split '\|', $found) {
+            my ($x, $y) = split "=", $_;
+
+            # case, tense, number, mood, person, degree, gender
+            # An entry without key or value is indexed with an empty
+            # part instead of raising an undefined value warning
+            $mtt->add(term => 'mate_m:' . ($x // '') . ':' . ($y // ''));
+          };
+        };
+      };
+    }) or return;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/OpenNLP/Morpho.pm b/lib/KorAP/Index/OpenNLP/Morpho.pm
new file mode 100644
index 0000000..20716b0
--- /dev/null
+++ b/lib/KorAP/Index/OpenNLP/Morpho.pm

@@ -0,0 +1,30 @@
+package KorAP::Index::OpenNLP::Morpho;
+use KorAP::Index::Base;
+
+# Add an 'opennlp_p:' term for every token of the opennlp#morpho layer
+# that carries a 'pos' feature with a value.
+# Returns 1 on success, nothing if the token data could not be added.
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'opennlp',
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash->{fs}->{f} or return;
+
+      # Tokens without a nested feature have nothing to index
+      $content = $content->{fs}->{f} or return;
+
+      # part of speech
+      if (($content->{-name} eq 'pos') && ($content->{'#text'})) {
+        $mtt->add(
+          term => 'opennlp_p:' . $content->{'#text'}
+        );
+      };
+    }) or return;
+
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
new file mode 100644
index 0000000..7868fb2
--- /dev/null
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm

@@ -0,0 +1,29 @@
+package KorAP::Index::OpenNLP::Sentences;
+use KorAP::Index::Base;
+
+# Add a '<>:s' term for every span of the opennlp#sentences layer and
+# store the number of added spans as meta information 's'.
+# Returns 1 on success, nothing if the span data could not be added.
+sub parse {
+  my $self = shift;
+  my $sentences = 0;
+
+  my %param = (
+    foundry => 'opennlp',
+    layer   => 'sentences'
+  );
+
+  # Invoked for every span of the layer
+  $param{cb} = sub {
+    my ($stream, $span) = @_;
+    my $mtt = $stream->pos($span->p_start);
+    my %term = (
+      term    => '<>:s',
+      o_start => $span->o_start,
+      o_end   => $span->o_end,
+      p_end   => $span->p_end
+    );
+    $mtt->add(%term);
+    $sentences++;
+  };
+
+  return unless $$self->add_spandata(%param);
+
+  $$self->stream->add_meta('s', '<i>' . $sentences);
+
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/TreeTagger/Morpho.pm b/lib/KorAP/Index/TreeTagger/Morpho.pm
new file mode 100644
index 0000000..67994ac
--- /dev/null
+++ b/lib/KorAP/Index/TreeTagger/Morpho.pm

@@ -0,0 +1,48 @@
+package KorAP::Index::TreeTagger::Morpho;
+use KorAP::Index::Base;
+
+# Add the terms of the tree_tagger#morpho layer to the token stream:
+# 'tt_l:' for lemmas (except 'UNKNOWN' and '?') and 'tt_p:' for the
+# value of 'ctag' features. A token may carry a single reading or a
+# list of readings.
+# Returns 1 on success, nothing if the token data could not be added.
+sub parse {
+  my $self = shift;
+
+  # Invoked for every token of the layer
+  my $add_morpho = sub {
+    my ($stream, $token) = @_;
+    my $mtt = $stream->pos($token->pos);
+
+    # Treat a single reading like a list with one reading
+    my $readings = $token->hash->{fs}->{f};
+    $readings = [$readings] if ref $readings ne 'ARRAY';
+
+    foreach my $reading (@$readings) {
+      my $features = $reading->{fs}->{f};
+
+      foreach my $f (@$features) {
+        my $name = $f->{-name};
+        my $text = $f->{'#text'};
+
+        # lemma
+        if ($name eq 'lemma' && $text &&
+              $text ne 'UNKNOWN' && $text ne '?') {
+          $mtt->add(term => 'tt_l:' . $text);
+        };
+
+        # pos
+        if ($name eq 'ctag' && $text) {
+          $mtt->add(term => 'tt_p:' . $text);
+        };
+      };
+    };
+  };
+
+  my $added = $$self->add_tokendata(
+    foundry => 'tree_tagger',
+    layer   => 'morpho',
+    cb      => $add_morpho
+  );
+  return unless $added;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/XIP/Constituency.pm b/lib/KorAP/Index/XIP/Constituency.pm
new file mode 100644
index 0000000..61c2a5e
--- /dev/null
+++ b/lib/KorAP/Index/XIP/Constituency.pm

@@ -0,0 +1,83 @@
+package KorAP::Index::XIP::Constituency;
+use KorAP::Index::Base;
+use Set::Scalar;
+use v5.16;
+
+# Add the constituents of the xip#constituency layer to the token
+# stream as '<>:xip_const:' terms. All spans are collected first; the
+# spans that are not the target of a 'dominates' relation are then
+# added together with the spans they dominate.
+# Returns 1 on success, nothing if the span data could not be added.
+sub parse {
+  my $self = shift;
+
+  # Collect all spans and check for roots
+  my %xip_const;
+  my $xip_const_root = Set::Scalar->new;
+  my $xip_const_noroot = Set::Scalar->new;
+
+  # First run: remember every span by its id and record the ids of
+  # all spans that are dominated by another span
+  $$self->add_spandata(
+    foundry => 'xip',
+    layer => 'constituency',
+    encoding => 'xip',
+    cb => sub {
+      my ($stream, $span) = @_;
+
+      $xip_const{$span->id} = $span;
+      $xip_const_root->insert($span->id);
+
+      my $rel = $span->hash->{rel} or return;
+      # A single relation is not wrapped in an array
+      $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+      foreach (@$rel) {
+	if ($_->{-label} eq 'dominates' && $_->{-target}) {
+	  $xip_const_noroot->insert($_->{-target});
+	};
+      };
+    }
+  ) or return;
+
+  my $stream = $$self->stream;
+
+  # Add the term for a span at the given level and recurse into
+  # the spans it dominates
+  my $add_const = sub {
+    my $span = shift;
+    my $level = shift;
+    my $mtt = $stream->pos($span->p_start);
+
+    my $content = $span->hash;
+    my $f = $content->{fs}->{f};
+    return unless $f->{-name} eq 'const';
+
+    my $type = $f->{'#text'} or return;
+
+    # $type is now NPA, NP, NUM ...
+    my %term = (
+      term => '<>:xip_const:' . $type,
+      o_start => $span->o_start,
+      o_end => $span->o_end,
+      p_end => $span->p_end
+    );
+
+    # The level is only stored as payload below the top level (level 0)
+    $term{payload} = '<s>' . $level if $level;
+
+    $mtt->add(%term);
+
+    # Reference to this anonymous subroutine for the recursion
+    my $this = __SUB__;
+
+    my $rel = $content->{rel} or return;
+    $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+    foreach (@$rel) {
+      next if $_->{-label} ne 'dominates' || !$_->{-target};
+      # Every span is removed from the collection once it is processed.
+      # NOTE(review): a missing target returns from the whole subroutine
+      # and thereby skips the remaining relations - confirm that 'next'
+      # is not intended here.
+      my $subspan = delete $xip_const{$_->{-target}} or return;
+      $this->($subspan, $level + 1);
+    };
+  };
+
+  # Second run: start at all spans that are not dominated by another span
+  my $diff = $xip_const_root->difference($xip_const_noroot);
+  foreach ($diff->members) {
+    my $obj = delete $xip_const{$_} or next;
+    $add_const->($obj, 0);
+  };
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/XIP/Dependency.pm b/lib/KorAP/Index/XIP/Dependency.pm
new file mode 100644
index 0000000..ca889d8
--- /dev/null
+++ b/lib/KorAP/Index/XIP/Dependency.pm

@@ -0,0 +1,58 @@
+package KorAP::Index::XIP::Dependency;
+use KorAP::Index::Base;
+
+# Add the relations of the xip#dependency layer to the token stream.
+#
+# Unary relations are added as 'xip_d:' terms. All other relations are
+# added twice: as a '>:xip_d:' term at the current token and as a
+# '<:xip_d:' term at the related token, each with the position of the
+# other token as payload.
+# Returns 1 on success, nothing if the token data could not be added.
+sub parse {
+  my $self = shift;
+
+  # TODO: Create XIP tree here - for indirect dependency
+  # >>:xip_d:SUBJ<i>566<i>789
+
+  $$self->add_tokendata(
+    foundry => 'xip',
+    layer => 'dependency',
+    encoding => 'xip',
+    cb => sub {
+      my ($stream, $token, $tokens) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      # A token without relations has nothing to index - without this
+      # guard an undefined relation would be wrapped and processed
+      my $rel = $token->hash->{rel} or return;
+      $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+      foreach my $relation (@$rel) {
+        my $label = $relation->{-label};
+
+        if ($relation->{-type} && $relation->{-type} eq 'unary') {
+          $mtt->add(
+            term => 'xip_d:' . $label
+          );
+          next;
+        };
+
+        my $from = $relation->{span}->{-from};
+        my $to   = $relation->{span}->{-to};
+
+        # Relations to unknown tokens are ignored
+        my $rel_token = $tokens->token($from, $to) or next;
+
+        $mtt->add(
+          term => '>:xip_d:' . $label,
+          payload => '<i>' . $rel_token->pos
+        );
+
+        $stream->pos($rel_token->pos)->add(
+          term => '<:xip_d:' . $label,
+          payload => '<i>' . $token->pos
+        );
+      };
+    }) or return;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/XIP/Morpho.pm b/lib/KorAP/Index/XIP/Morpho.pm
new file mode 100644
index 0000000..d568afe
--- /dev/null
+++ b/lib/KorAP/Index/XIP/Morpho.pm

@@ -0,0 +1,61 @@
+package KorAP::Index::XIP::Morpho;
+use KorAP::Index::Base;
+
+# Add the terms of the xip#morpho layer to the token stream:
+# 'xip_p:' for the part of speech and 'xip_l:' for lemmas. Lemmas are
+# split at '#' into the parts of a composite; each part is added
+# without a trailing '/<word characters>' suffix, and for composites
+# the joined form is added as well.
+# Returns 1 on success, nothing if the token data could not be added.
+sub parse {
+  my $self = shift;
+
+  # Invoked for every token of the layer
+  my $add_morpho = sub {
+    my ($stream, $token) = @_;
+    my $mtt = $stream->pos($token->pos);
+
+    my $features = $token->hash->{fs}->{f}->{fs}->{f};
+
+    # First pass: part of speech - needed to decide on the
+    # capitalization of joined composites
+    my $is_noun = 0;
+    foreach my $f (@$features) {
+      next unless $f->{-name} eq 'pos';
+      my $tag = $f->{'#text'} or next;
+
+      $mtt->add(term => 'xip_p:' . $tag);
+
+      $is_noun = 1 if $tag eq 'NOUN';
+    };
+
+    # Second pass: lemma
+    foreach my $f (@$features) {
+      next unless $f->{-name} eq 'lemma';
+      my $lemma = $f->{'#text'} or next;
+
+      # Verb delimiter (aus=druecken)
+      $lemma =~ tr/=//d;
+
+      # Composites
+      my @parts = split('#', $lemma);
+
+      # The joined form is built from the parts including their suffixes
+      my $full = join('', @parts);
+
+      foreach my $part (@parts) {
+        (my $bare = $part) =~ s{/\w+$}{};
+        $mtt->add(term => 'xip_l:' . $bare);
+      };
+
+      next unless @parts > 1;
+
+      # Only the slashes are removed from the joined form
+      $full =~ s{/}{}g;
+      $full = lc $full;
+      $full = ucfirst($full) if $is_noun;
+      $mtt->add(term => 'xip_l:' . $full);
+    };
+  };
+
+  my $added = $$self->add_tokendata(
+    foundry  => 'xip',
+    layer    => 'morpho',
+    encoding => 'xip',
+    cb       => $add_morpho
+  );
+  return unless $added;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 7038cc4..4920215 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm

@@ -1,16 +1,18 @@
 package KorAP::Tokenizer;
-
 use Mojo::Base -base;
 use Mojo::ByteStream 'b';
-use Carp qw/carp croak/;
+use Mojo::Loader;
+use Carp qw/croak/;
 use KorAP::Tokenizer::Range;
 use KorAP::Tokenizer::Match;
 use KorAP::Tokenizer::Spans;
 use KorAP::Tokenizer::Tokens;
-use KorAP::MultiTermTokenStream;
+use KorAP::Field::MultiTermTokenStream;
+use JSON::XS;
 use Log::Log4perl;
 
-has [qw/path foundry layer doc stream should have/];
+has [qw/path foundry doc stream should have name/];
+has layer => 'Tokens';
 
 has 'log' => sub {
   Log::Log4perl->get_logger(__PACKAGE__)
@@ -21,8 +23,8 @@
   my $self = shift;
 
   # Create new token stream
-  my $mtts = KorAP::MultiTermTokenStream->new;
-  my $file = b($self->path . $self->foundry . '/' . ($self->layer // 'tokens') . '.xml')->slurp;
+  my $mtts = KorAP::Field::MultiTermTokenStream->new;
+  my $file = b($self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml')->slurp;
   my $tokens = Mojo::DOM->new($file);
   $tokens->xml(1);
 
@@ -121,13 +123,12 @@
 
   my $cb = delete $param{cb};
 
-  if ($param{encoding} && $param{encoding} eq 'bytes') {
-    $param{primary} = $self->doc->primary;
-  };
+  $param{primary} = $self->doc->primary;
 
   my $spans = KorAP::Tokenizer::Spans->new(
     path => $self->path,
     range => $self->range,
+    match => $self->match,
     %param
   );
 
@@ -140,12 +141,11 @@
     $self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %');
   };
 
-
   if ($cb) {
     foreach (@$spanarray) {
-      $cb->($self->stream, $_);
+      $cb->($self->stream, $_, $spans);
     };
-    return;
+    return 1;
   };
   return $spans;
 };
@@ -165,12 +165,11 @@
 
   my $cb = delete $param{cb};
 
-  if ($param{encoding} && $param{encoding} eq 'bytes') {
-    $param{primary} = $self->doc->primary;
-  };
+  $param{primary} = $self->doc->primary;
 
   my $tokens = KorAP::Tokenizer::Tokens->new(
     path => $self->path,
+    range => $self->range,
     match => $self->match,
     %param
   );
@@ -189,14 +188,35 @@
 
   if ($cb) {
     foreach (@$tokenarray) {
-      $cb->($self->stream, $_);
+      $cb->($self->stream, $_, $tokens);
     };
-    return;
+    return 1;
   };
   return $tokens;
 };
 
 
+# Load the index module KorAP::Index::<foundry>::<layer> and let it add
+# its annotations to the token stream.
+#
+# Expects the name of the foundry and the name of the layer; all
+# further parameters are passed to the parse method of the module.
+# On success the layer is registered using support() and the return
+# value of the parse method is returned. Returns nothing if the module
+# can't be loaded (which is logged as an error) or parsing fails.
+sub add {
+  my $self = shift;
+  my $foundry = shift;
+  my $layer = shift;
+  my $mod = 'KorAP::Index::' . $foundry . '::' . $layer;
+
+  # Only accept plain package names, so neither code nor arbitrary
+  # paths can be smuggled into the require below
+  unless ($mod =~ /\A\w+(?:::\w+)+\z/) {
+    $self->log->error('Unable to load ' . $mod . '(Invalid module name)');
+    return;
+  };
+
+  # Require by file name - this avoids evaluating a string as code
+  (my $file = $mod . '.pm') =~ s{::}{/}g;
+
+  if ($mod->can('new') || eval { require $file; 1 }) {
+    if (my $retval = $mod->new($self)->parse(@_)) {
+      $self->support($foundry => $layer, @_);
+      return $retval;
+    };
+  }
+  else {
+    $self->log->error('Unable to load '.$mod . '(' . $@ . ')');
+  };
+
+  return;
+};
+
+
 sub _perc {
   if (@_ == 2) {
     # '[' . $_[0] . '/' . $_[1] . ']' .
@@ -215,6 +235,73 @@
 };
 
 
+# Get or set information on the supported layers.
+#
+#   $self->support                          - hash reference of all
+#                                             foundries (may be empty)
+#   $self->support($foundry)                - array reference of the
+#                                             layers of a foundry
+#   $self->support($foundry, $layer, @info) - register a layer
+#
+# Foundry and layer names are stored in lower case.
+sub support {
+  my ($self, $foundry, $layer, @info) = @_;
+
+  # Getter for all foundries
+  if (!$foundry) {
+    return $self->{support} // {};
+  };
+
+  # Names are stored in lower case, so the lookup has to
+  # use the lower case name as well
+  $foundry = lc $foundry;
+
+  # Getter for a single foundry
+  if (!$layer) {
+    return $self->{support}->{$foundry} // [];
+  };
+
+  # Setter
+  $self->{support} //= {};
+  $self->{support}->{$foundry} //= [];
+  push(@{$self->{support}->{$foundry}}, [lc $layer, @info]);
+};
+
+
+# Serialize the document with its token stream as a string.
+# The primary text is included unless a defined false value is passed.
+sub to_string {
+  my ($self, $primary) = @_;
+  $primary = 1 unless defined $primary;
+
+  # Meta data
+  my @parts = ("<meta>\n", scalar($self->doc->to_string), "</meta>\n");
+
+  # Primary text
+  if ($primary) {
+    push @parts, "<text>\n", $self->doc->primary->data . "\n", "</text>\n";
+  };
+
+  # Field with tokenization and support information
+  push @parts,
+    '<field name="' . $self->name . "\">\n",
+    "<info>\n",
+    'tokenization = ' . $self->foundry . '#' . $self->layer . "\n";
+
+  foreach my $foundry (keys %{$self->support}) {
+    foreach my $layer (@{$self->support($foundry)}) {
+      push @parts, 'support = ' . $foundry . '#' . join(',', @$layer) . "\n";
+    };
+  };
+
+  # Token stream
+  push @parts, "</info>\n", scalar($self->stream->to_string), "</field>";
+
+  return join('', @parts);
+};
+
+# Return the document with its token stream as a data structure:
+# a hash reference with the keys 'meta', 'fields' and - unless a
+# defined false value is passed - 'primary'.
+sub to_data {
+  my ($self, $primary) = @_;
+  $primary = 1 unless defined $primary;
+
+  my $data = {};
+  $data->{meta} = $self->doc->to_hash;
+
+  if ($primary) {
+    $data->{primary} = $self->doc->primary->data;
+  };
+
+  # There is exactly one field for the token stream
+  my %field = (
+    name => $self->name,
+    data => $self->stream->to_array,
+    tokenization => [lc($self->foundry), lc($self->layer)],
+    support => $self->support
+  );
+  $data->{fields} = [ \%field ];
+
+  return $data;
+};
+
+# Return the data structure of to_data() encoded as JSON.
+# The optional parameter is passed on to to_data().
+sub to_json {
+  my ($self, $primary) = @_;
+  return encode_json($self->to_data($primary));
+};
+
+
+# Return the data structure of to_data() encoded as pretty printed JSON.
+# The optional parameter is passed on to to_data().
+sub to_pretty_json {
+  my ($self, $primary) = @_;
+  my $encoder = JSON::XS->new->pretty;
+  return $encoder->encode($self->to_data($primary));
+};
+
+
 1;
 
 
@@ -276,7 +363,7 @@
 
   $tokens->stream->add_meta('adjCount', '<i>45');
 
-The L<KorAP::MultiTermTokenStream> object
+The L<KorAP::Field::MultiTermTokenStream> object
 
 
 =head2 range
@@ -321,7 +408,7 @@
 Add span information to the parsed token stream.
 Expects a C<foundry> name, a C<layer> name and a
 callback parameter, that will be called after each parsed
-span. The L<KorAP::MultiTermTokenStream> object will be passed,
+span. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
 as well as the current L<KorAP::Tokenizer::Span>.
 
 An optional parameter C<encoding> may indicate that the span offsets
@@ -351,7 +438,7 @@
 Add token information to the parsed token stream.
 Expects a C<foundry> name, a C<layer> name and a
 callback parameter, that will be called after each parsed
-token. The L<KorAP::MultiTermTokenStream> object will be passed,
+token. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
 as well as the current L<KorAP::Tokenizer::Span>.
 
 An optional parameter C<encoding> may indicate that the token offsets

diff --git a/lib/KorAP/Tokenizer/Match.pm b/lib/KorAP/Tokenizer/Match.pm
index e5a96f8..2a06aea 100644
--- a/lib/KorAP/Tokenizer/Match.pm
+++ b/lib/KorAP/Tokenizer/Match.pm

@@ -8,10 +8,21 @@
 
 sub set {
   $_[0]->{$_[1] . ':' . $_[2]} = $_[3];
+  $_[0]->{'[' . $_[1]} = $_[3];
+  $_[0]->{$_[2] . ']'} = $_[3];
 };
 
 sub lookup {
   $_[0]->{$_[1] . ':' . $_[2]} // undef;
 };
 
+# Return the value stored for the given start offset
+# (key '[' followed by the offset) or undef
+sub startswith {
+  my ($self, $from) = @_;
+  return $self->{'[' . $from};
+};
+
+# Return the value stored for the given end offset
+# (key of the offset followed by ']') or undef
+sub endswith {
+  my ($self, $to) = @_;
+  return $self->{$to . ']'};
+};
+
+
 1;

diff --git a/lib/KorAP/Tokenizer/Range.pm b/lib/KorAP/Tokenizer/Range.pm
index c18136b..110fbc6 100644
--- a/lib/KorAP/Tokenizer/Range.pm
+++ b/lib/KorAP/Tokenizer/Range.pm

@@ -35,9 +35,9 @@
 
   if ($found =~ /!(\d+):(\d+)$/) {
     return $1 >= 0 ? $1 : 0;
-  }
-  else {
-    return $found;
+#  }
+#  else {
+#    return $found;
   };
 };
 
@@ -46,9 +46,9 @@
   my $found = $$self->lookup( shift() );
   if ($found =~ /!(\d+):(\d+)$/) {
     return $2;
-  }
-  else {
-    return $found;
+#  }
+#  else {
+#    return $found;
   };
 };
 

diff --git a/lib/KorAP/Tokenizer/Span.pm b/lib/KorAP/Tokenizer/Span.pm
index 0010d7c..485e700 100644
--- a/lib/KorAP/Tokenizer/Span.pm
+++ b/lib/KorAP/Tokenizer/Span.pm

@@ -42,29 +42,33 @@
   $_[0]->[4];
 };
 
-
 sub content {
   if (defined $_[1]) {
     $_[0]->[5] = $_[1];
   }
   else {
-    if ($_[0]->processed) {
-      return $_[0]->[5];
-    }
-    else {
-      my $c = Mojo::DOM->new($_[0]->[5]);
-      $c->xml(1);
-      $_[0]->processed(1);
-      return $_[0]->[5] = $c;
-    };
+    return $_[0]->[5];
   };
 };
 
-sub processed {
-  if (defined $_[1]) {
-    $_[0]->[6] = $_[1] ? 1 : 0;
+sub dom {
+  if ($_[0]->[6]) {
+    return $_[0]->[6];
+  }
+  else {
+    my $c = Mojo::DOM->new($_[0]->[5]);
+    $c->xml(1);
+    return $_[0]->[6] = $c;
   };
-  $_[0]->[6];
+};
+
+sub hash {
+  if (defined $_[1]) {
+    $_[0]->[7] = $_[1];
+  }
+  else {
+    return $_[0]->[7];
+  };
 };
 
 1;

diff --git a/lib/KorAP/Tokenizer/Spans.pm b/lib/KorAP/Tokenizer/Spans.pm
index 7e1a382..7eb31c0 100644
--- a/lib/KorAP/Tokenizer/Spans.pm
+++ b/lib/KorAP/Tokenizer/Spans.pm

@@ -1,58 +1,43 @@
 package KorAP::Tokenizer::Spans;
-use Mojo::Base -base;
+use Mojo::Base 'KorAP::Tokenizer::Units';
 use KorAP::Tokenizer::Span;
 use Mojo::DOM;
 use Mojo::ByteStream 'b';
+use XML::Fast;
 
-has [qw/path foundry layer range primary should have/];
-has 'encoding' => 'utf-8';
+has 'range';
 
 sub parse {
   my $self = shift;
   my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
 
-  my $spans = Mojo::DOM->new($file);
-  $spans->xml(1);
+  # my $spans = Mojo::DOM->new($file);
+  # $spans->xml(1);
+
+  # my $spans = XML::LibXML->load_xml(string => $file);
+
+  my $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList}->{span};
 
   my ($should, $have) = (0,0);
-  my ($from, $to);
+  my ($from, $to, $h);
 
   my @spans;
-  $spans->find('span')->each(
-    sub {
-      my $s = shift;
+  my $p = $self->primary;
 
-      $should++;
+  foreach my $s (@$spans) {
 
-      if ($self->encoding eq 'bytes') {
-	$from = $self->primary->bytes2chars($s->attr('from'));
-	$to = $self->primary->bytes2chars($s->attr('to'));
-      }
-      else {
-	$from = $s->attr('from');
-	$to = $s->attr('to');
-      };
+    $should++;
 
-      return unless $to > $from;
+    my $span = $self->span(
+      $s->{-from},
+      $s->{-to},
+      $s
+    ) or next;
 
-      my $span = KorAP::Tokenizer::Span->new;
+    $have++;
 
-      $span->id($s->attr('id'));
-      $span->o_start($from);
-      $span->o_end($to);
-      $span->p_start($self->range->after($span->o_start));
-      $span->p_end($self->range->before($span->o_end));
-
-      return unless $span->p_end >= $span->p_start;
-
-      if (@{$s->children}) {
-	$span->content($s->content_xml);
-      };
-
-      $have++;
-
-      push(@spans, $span);
-    });
+    push(@spans, $span);
+  };
 
   $self->should($should);
   $self->have($have);

diff --git a/lib/KorAP/Tokenizer/Token.pm b/lib/KorAP/Tokenizer/Token.pm
index f6c1971..fe12a25 100644
--- a/lib/KorAP/Tokenizer/Token.pm
+++ b/lib/KorAP/Tokenizer/Token.pm

@@ -14,14 +14,13 @@
   $_[0]->[0];
 };
 
+
 sub content {
-  if ($_[1]) {
+  if (defined $_[1]) {
     $_[0]->[1] = $_[1];
   }
   else {
-    my $c = Mojo::DOM->new($_[0]->[1]);
-    $c->xml(1);
-    return $c;
+    return $_[0]->[1];
   };
 };
 
@@ -34,4 +33,25 @@
   };
 };
 
+# Return the content (slot 1) parsed by Mojo::DOM with XML mode enabled.
+# The DOM is built on first use and cached in slot 3.
+sub dom {
+  my $self = $_[0];
+  return $self->[3] if $self->[3];
+  my $dom = Mojo::DOM->new($self->[1]);
+  $dom->xml(1);
+  $self->[3] = $dom;
+  return $dom;
+};
+
+# Combined getter/setter for slot 4 of the token array.
+# With a defined argument the value is stored and returned;
+# without one the currently stored value is returned.
+sub hash {
+  my ($self, $value) = @_;
+  return $self->[4] unless defined $value;
+  return $self->[4] = $value;
+};
+
+
 1;

diff --git a/lib/KorAP/Tokenizer/Tokens.pm b/lib/KorAP/Tokenizer/Tokens.pm
index 7657460..58ff6fa 100644
--- a/lib/KorAP/Tokenizer/Tokens.pm
+++ b/lib/KorAP/Tokenizer/Tokens.pm

@@ -1,56 +1,38 @@
 package KorAP::Tokenizer::Tokens;
-use Mojo::Base -base;
+use Mojo::Base 'KorAP::Tokenizer::Units';
 use Mojo::DOM;
 use Mojo::ByteStream 'b';
 use KorAP::Tokenizer::Token;
+use Carp qw/croak carp/;
+use XML::Fast;
 
-has [qw/path foundry layer match primary should have/];
-has 'encoding' => 'utf-8';
 
 sub parse {
   my $self = shift;
   my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
 
-  my $spans = Mojo::DOM->new($file);
-  $spans->xml(1);
+#  my $spans = Mojo::DOM->new($file);
+#  $spans->xml(1);
+  my $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList}->{span};
 
   my ($should, $have) = (0,0);
-  my ($from, $to);
-
-  my $match = $self->match;
 
   my @tokens;
-  $spans->find('span')->each(
-    sub {
-      my $s = shift;
 
-      $should++;
+  foreach my $s (@$spans) {
 
-      if ($self->encoding eq 'bytes') {
-	$from = $self->primary->bytes2chars($s->attr('from'));
-	$to = $self->primary->bytes2chars($s->attr('to'));
-      }
-      else {
-	$from = $s->attr('from');
-	$to = $s->attr('to');
-      };
+    $should++;
 
-      my $pos = $match->lookup($from, $to);
+    my $token = $self->token(
+      $s->{-from},
+      $s->{-to},
+      $s
+    ) or next;
 
-      return unless defined $pos;
+    $have++;
 
-      my $token = KorAP::Tokenizer::Token->new;
-      $token->id($s->attr('id'));
-      $token->pos($pos);
-
-      if (@{$s->children}) {
-	$token->content($s->content_xml);
-      };
-
-      $have++;
-
-      push(@tokens, $token);
-    });
+    push(@tokens, $token);
+  };
 
   $self->should($should);
   $self->have($have);

diff --git a/lib/KorAP/Tokenizer/Units.pm b/lib/KorAP/Tokenizer/Units.pm
new file mode 100644
index 0000000..c04ebaf
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Units.pm

@@ -0,0 +1,89 @@
+package KorAP::Tokenizer::Units;
+use KorAP::Tokenizer::Span;
+use KorAP::Tokenizer::Token;
+use Mojo::Base -base;
+
+has [qw/path foundry layer match range primary should have/]; # accessors declared without defaults
+has 'encoding' => 'utf-8'; # offset encoding; _offset converts 'bytes' and 'xip' offsets
+
+# Build a KorAP::Tokenizer::Span for the offsets $from/$to. $s is the
+# optional hash of the source element (its id is read from key '-id').
+# Returns the span, or nothing if it is empty or has no token positions.
+sub span {
+  my $self = shift;
+  my ($from, $to, $s) = @_;
+
+  ($from, $to) = $self->_offset($from, $to);
+  return unless $to > $from;
+
+  my $span = KorAP::Tokenizer::Span->new;
+  $span->id($s->{-id}) if $s && $s->{-id};
+  $span->o_start($from);
+  $span->o_end($to);
+
+  # Ask match for the position first and fall back to the range lookup.
+  # Test definedness, not truth, so that a position of 0 is not mistaken
+  # for a failed lookup.
+  my $start = $self->match->startswith($span->o_start)
+    // $self->range->after($span->o_start);
+  return unless defined $start;
+  $span->p_start($start);
+
+  my $end = $self->match->endswith($span->o_end)
+    // $self->range->before($span->o_end);
+  return unless defined $end;
+  $span->p_end($end);
+
+  # Reject spans that end before they start
+  return unless $span->p_end >= $span->p_start;
+
+  $span->hash($s) if $s;
+
+  return $span;
+};
+
+# Create a KorAP::Tokenizer::Token for the offsets $from/$to.
+# $elem is the optional source element hash ('-id' holds its id).
+# Returns the token, or nothing if match->lookup yields no position.
+sub token {
+  my ($self, @args) = @_;
+  my ($begin, $finish) = $self->_offset(@args[0, 1]);
+  my $elem = $args[2];
+
+  my $position = $self->match->lookup($begin, $finish);
+  defined $position or return;
+
+  # Disabled debug output, kept for reference:
+  #   if ($begin == $finish) {
+  #     print "Unable to find match for $begin - $finish (resp ".$elem->{-from} . '-' . $elem->{-to}.") " . $elem->{-id};
+  #     print "\n";
+  #   };
+
+  my $unit = KorAP::Tokenizer::Token->new;
+  $unit->id($elem->{-id}) if $elem && $elem->{-id};
+  $unit->pos($position);
+  $unit->hash($elem) if $elem;
+  return $unit;
+};
+
+
+# Convert the offsets $from/$to via the primary data when the encoding is
+# 'bytes' (bytes2chars) or 'xip' (xip2chars); otherwise pass them through.
+# Returns the list ($from, $to).
+sub _offset {
+  my ($self, $from, $to) = @_;
+
+  my $encoding = $self->encoding or return ($from, $to);
+  my $primary = $self->primary;
+
+  # Conversion method on the primary data for each handled encoding
+  my %method_for = (bytes => 'bytes2chars', xip => 'xip2chars');
+  my $convert = $method_for{$encoding} or return ($from, $to);
+
+  $from = $primary->$convert($from);
+  $to = $primary->$convert($to);
+
+  return ($from, $to);
+};
+
+1;
commit	7364d1f29b898a41bb2074a34ce671f7c36ca5de	[log] [tgz]
author	Nils Diewald <nils@diewald-online.de>	Tue Nov 05 19:26:35 2013 +0000
committer	Nils Diewald <nils@diewald-online.de>	Tue Nov 05 19:26:35 2013 +0000
tree	94575601cacd010be2d075a2b8a6e1926fdb60f1
parent	2db9ad0d484214d641313937810225a635a997a4 [diff]