Sentence annotations for all providing foundries and a first subtokenization based on cschnober's code

commit: f03c680ecc25127bdeea6ecd9bfac68cf02af912 [log] [tgz]
author: Nils Diewald <nils@diewald-online.de> Mon Jul 21 16:39:44 2014 +0000
committer: Nils Diewald <nils@diewald-online.de> Mon Jul 21 16:39:44 2014 +0000
tree: 66d16134973fca13b6f4c781ce10922895c8f343
parent: ff6d078115bb1f2965fa7962c39e11a22f8d0df3 [diff]
diff --git a/lib/KorAP/Bundle.pm b/lib/KorAP/Bundle.pm
deleted file mode 100644
index 43997e3..0000000
--- a/lib/KorAP/Bundle.pm
+++ /dev/null

@@ -1,5 +0,0 @@
-package KorAP::Bundle;
-
-our $VERSION = 0.01;
-
-1;

diff --git a/lib/KorAP/Field/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
index 6e69704..6e43493 100644
--- a/lib/KorAP/Field/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm

@@ -24,6 +24,15 @@
   return $mt;
 };
 
+
+sub surface {
+  return shift->{mt}->[0]->term; # the surface term is always added first
+};
+
+sub lc_surface {
+  return shift->{mt}->[1]->term; # second term; presumably the lower-cased surface
+};
+
 sub to_string {
   my $self = shift;
   my $string = '[(' . $self->o_start . '-'. $self->o_end . ')';

diff --git a/lib/KorAP/Field/MultiTermTokenStream.pm b/lib/KorAP/Field/MultiTermTokenStream.pm
index ea96a3e..f9e97a2 100644
--- a/lib/KorAP/Field/MultiTermTokenStream.pm
+++ b/lib/KorAP/Field/MultiTermTokenStream.pm

@@ -30,6 +30,10 @@
   return join("\n" , map { $_->to_string } @{$self->{mtt}}) . "\n";
 };
 
+sub multi_term_tokens {
+  return shift->{mtt}; # the stream's internal token list (array reference)
+};
+
 sub to_array {
   my $self = shift;
   [ map { $_->to_array } @{$self->{mtt}} ];

diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
index c2670bc..f0b6e6d 100644
--- a/lib/KorAP/Index/Base/Paragraphs.pm
+++ b/lib/KorAP/Index/Base/Paragraphs.pm

@@ -11,7 +11,7 @@
       my ($stream, $span) = @_;
       my $mtt = $stream->pos($span->p_start);
       $mtt->add(
-	term => '<>:base/para',
+	term => '<>:base/s:p',
 	o_start => $span->o_start,
 	o_end => $span->o_end,
 	p_end => $span->p_end

diff --git a/lib/KorAP/Index/Base/Sentences.pm b/lib/KorAP/Index/Base/Sentences.pm
index 8c3c296..a9fcd25 100644
--- a/lib/KorAP/Index/Base/Sentences.pm
+++ b/lib/KorAP/Index/Base/Sentences.pm

@@ -15,7 +15,7 @@
       my $mtt = $stream->pos($span->p_start);
       $first = [$span->p_start, $span->o_start] unless defined $first;
       $mtt->add(
-	term => '<>:base/s',
+	term => '<>:base/s:s',
 	o_start => $span->o_start,
 	o_end => $span->o_end,
 	p_end => $span->p_end
@@ -28,7 +28,7 @@
 
   my $mt = $$self->stream->pos($first->[0]);
   $mt->add(
-    term => '<>:base/text',
+    term => '<>:base/s:t',
     o_start => $first->[1],
     p_end => $last_p,
     o_end => $last_o

diff --git a/lib/KorAP/Index/Connexor/Sentences.pm b/lib/KorAP/Index/Connexor/Sentences.pm
new file mode 100644
index 0000000..04cee09
--- /dev/null
+++ b/lib/KorAP/Index/Connexor/Sentences.pm

@@ -0,0 +1,29 @@
+package KorAP::Index::Connexor::Sentences;
+use KorAP::Index::Base;
+
+# Add connexor sentence spans to the stream and store their count as meta.
+# Returns 1, or nothing if add_spandata() returns a false value.
+sub parse {
+  my ($self) = @_;
+  my $sentences = 0;
+
+  # Callback: add the span term at the span's start position and count it
+  my $index_span = sub {
+    my ($stream, $span) = @_;
+    $stream->pos($span->p_start)->add(
+      term    => '<>:cnx/s:s',
+      o_start => $span->o_start,
+      o_end   => $span->o_end,
+      p_end   => $span->p_end
+    );
+    $sentences++;
+  };
+
+  $$self->add_spandata(
+    foundry => 'connexor', layer => 'sentences', cb => $index_span
+  ) or return;
+  $$self->stream->add_meta('cnx/sentences', '<i>' . $sentences);
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/CoreNLP/Constituency.pm b/lib/KorAP/Index/CoreNLP/Constituency.pm
new file mode 100644
index 0000000..4793bfd
--- /dev/null
+++ b/lib/KorAP/Index/CoreNLP/Constituency.pm

@@ -0,0 +1,85 @@
+package KorAP::Index::CoreNLP::Constituency;
+use KorAP::Index::Base;
+use Set::Scalar;
+use v5.16;
+
+sub parse {
+  my $self = shift;
+  # Index CoreNLP constituency spans as '<>:corenlp/c:TYPE' terms.
+  # Collect all spans and check for roots
+  my %corenlp_const;
+  my $corenlp_const_root = Set::Scalar->new;
+  my $corenlp_const_noroot = Set::Scalar->new;
+
+  # First run: register every span and collect the ids of dominated spans
+  $$self->add_spandata(
+    foundry => 'corenlp',
+    layer => 'constituency',
+    cb => sub {
+      my ($stream, $span) = @_;
+      # Every span is a root candidate until it shows up as a target
+      $corenlp_const{$span->id} = $span;
+      $corenlp_const_root->insert($span->id);
+      # Spans without relations are done; a single relation gets wrapped
+      my $rel = $span->hash->{rel} or return;
+      $rel = [$rel] unless ref $rel eq 'ARRAY';
+      # Targets of 'dominates' relations have a parent and can't be roots
+      foreach (@$rel) {
+	if ($_->{-label} eq 'dominates' && $_->{-target}) {
+	  $corenlp_const_noroot->insert($_->{-target});
+	};
+      };
+    }
+  ) or return;
+
+  my $stream = $$self->stream;
+  # Second run: add a span and, recursively, all spans it dominates
+  my $add_const = sub {
+    my $span = shift;
+    my $level = shift;
+    my $mtt = $stream->pos($span->p_start);
+    # Only spans whose feature is named 'const' and has a text are added
+    my $content = $span->hash;
+    my $f = $content->{fs}->{f};
+    return unless $f->{-name} eq 'const';
+
+    my $type = $f->{'#text'} or return;
+
+    # $type is now NPA, NP, NUM ...
+    my %term = (
+      term => '<>:corenlp/c:' . $type,
+      o_start => $span->o_start,
+      o_end => $span->o_end,
+      p_end => $span->p_end
+    );
+    # Nested spans carry their depth as payload; roots (level 0) don't
+    $term{payload} = '<b>' . $level if $level;
+
+    $mtt->add(%term);
+    # Reference to this closure itself, used for the recursion below
+    my $this = __SUB__;
+    # Spans without relations are leaves
+    my $rel = $content->{rel} or return;
+    $rel = [$rel] unless ref $rel eq 'ARRAY';
+    # Each dominated span is consumed once; a missing one ends this call
+    foreach (@$rel) {
+      next if $_->{-label} ne 'dominates' || !$_->{-target};
+      my $subspan = delete $corenlp_const{$_->{-target}} or return;
+      $this->($subspan, $level + 1);
+    };
+  };
+  # Start at the roots: all span ids that were never a 'dominates' target
+  my $diff = $corenlp_const_root->difference($corenlp_const_noroot);
+  foreach ($diff->members) {
+    my $obj = delete $corenlp_const{$_} or next;
+    $add_const->($obj, 0);
+  };
+
+  return 1;
+};
+
+sub layer_info {
+  return [ 'corenlp/c=const' ]; # layer descriptor reported by this module
+}
+
+1;

diff --git a/lib/KorAP/Index/CoreNLP/Sentences.pm b/lib/KorAP/Index/CoreNLP/Sentences.pm
new file mode 100644
index 0000000..1bd84e0
--- /dev/null
+++ b/lib/KorAP/Index/CoreNLP/Sentences.pm

@@ -0,0 +1,29 @@
+package KorAP::Index::CoreNLP::Sentences;
+use KorAP::Index::Base;
+
+# Add corenlp sentence spans to the stream and store their count as meta.
+# Returns 1, or nothing if add_spandata() returns a false value.
+sub parse {
+  my ($self) = @_;
+  my $sentences = 0;
+
+  # Callback: add the span term at the span's start position and count it
+  my $index_span = sub {
+    my ($stream, $span) = @_;
+    $stream->pos($span->p_start)->add(
+      term    => '<>:corenlp/s:s',
+      o_start => $span->o_start,
+      o_end   => $span->o_end,
+      p_end   => $span->p_end
+    );
+    $sentences++;
+  };
+
+  $$self->add_spandata(
+    foundry => 'corenlp', layer => 'sentences', cb => $index_span
+  ) or return;
+  $$self->stream->add_meta('corenlp/sentences', '<i>' . $sentences);
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/OpenNLP/Morpho.pm b/lib/KorAP/Index/OpenNLP/Morpho.pm
index 2de5042..7ebdd96 100644
--- a/lib/KorAP/Index/OpenNLP/Morpho.pm
+++ b/lib/KorAP/Index/OpenNLP/Morpho.pm

@@ -20,7 +20,7 @@
       if (($content->{-name} eq 'pos') && ($content->{'#text'})) {
 	$mtt->add(
 	  term => 'opennlp/p:' . $content->{'#text'}
-	);
+	) if $content->{'#text'};
       };
     }) or return;
 

diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
index 1ec1b60..fd0c9d3 100644
--- a/lib/KorAP/Index/OpenNLP/Sentences.pm
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm

@@ -12,7 +12,7 @@
       my ($stream, $span) = @_;
       my $mtt = $stream->pos($span->p_start);
       $mtt->add(
-	term => '<>:opennlp/s',
+	term => '<>:opennlp/s:s',
 	o_start => $span->o_start,
 	o_end => $span->o_end,
 	p_end => $span->p_end

diff --git a/lib/KorAP/Index/TreeTagger/Sentences.pm b/lib/KorAP/Index/TreeTagger/Sentences.pm
new file mode 100644
index 0000000..d96d96e
--- /dev/null
+++ b/lib/KorAP/Index/TreeTagger/Sentences.pm

@@ -0,0 +1,29 @@
+package KorAP::Index::TreeTagger::Sentences;
+use KorAP::Index::Base;
+
+# Add tree_tagger sentence spans to the stream and store their count as meta.
+# Returns 1, or nothing if add_spandata() returns a false value.
+sub parse {
+  my ($self) = @_;
+  my $sentences = 0;
+
+  # Callback: add the span term at the span's start position and count it
+  my $index_span = sub {
+    my ($stream, $span) = @_;
+    $stream->pos($span->p_start)->add(
+      term    => '<>:tt/s:s',
+      o_start => $span->o_start,
+      o_end   => $span->o_end,
+      p_end   => $span->p_end
+    );
+    $sentences++;
+  };
+
+  $$self->add_spandata(
+    foundry => 'tree_tagger', layer => 'sentences', cb => $index_span
+  ) or return;
+  $$self->stream->add_meta('tt/sentences', '<i>' . $sentences);
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/XIP/Constituency.pm b/lib/KorAP/Index/XIP/Constituency.pm
index a5edd28..f1a0615 100644
--- a/lib/KorAP/Index/XIP/Constituency.pm
+++ b/lib/KorAP/Index/XIP/Constituency.pm

@@ -98,7 +98,7 @@
     my $rel = $content->{rel};
 
     unless ($rel) {
-      warn $f->{-id} . ' has no relation';
+      warn $f->{-id} . ' has no relation' if $f->{-id};
       return;
     };
 
@@ -116,10 +116,9 @@
       next unless $target;
 
       my $subspan = delete $xip_const{$target};
-      unless ($subspan) {
-#	warn "Span " . $target . " not found";
-	return;
-      };
+      return unless $subspan;
+      #	warn "Span " . $target . " not found";
+
       $this->($subspan, $level + 1);
     };
   };

diff --git a/lib/KorAP/Index/XIP/Sentences.pm b/lib/KorAP/Index/XIP/Sentences.pm
new file mode 100644
index 0000000..f045152
--- /dev/null
+++ b/lib/KorAP/Index/XIP/Sentences.pm

@@ -0,0 +1,32 @@
+package KorAP::Index::XIP::Sentences;
+use KorAP::Index::Base;
+
+# Add xip sentence spans to the stream and store their count as meta.
+# Returns 1, or nothing if add_spandata() returns a false value.
+sub parse {
+  my ($self) = @_;
+  my $sentences = 0;
+
+  # Callback: add the span term at the span's start position and count it
+  my $index_span = sub {
+    my ($stream, $span) = @_;
+    $stream->pos($span->p_start)->add(
+      term    => '<>:xip/s:s',
+      o_start => $span->o_start,
+      o_end   => $span->o_end,
+      p_end   => $span->p_end
+    );
+    $sentences++;
+  };
+
+  $$self->add_spandata(
+    foundry  => 'xip',
+    layer    => 'sentences',
+    encoding => 'xip',
+    cb       => $index_span
+  ) or return;
+  $$self->stream->add_meta('xip/sentences', '<i>' . $sentences);
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Indexer.pm b/lib/KorAP/Indexer.pm
new file mode 100644
index 0000000..ba65c5b
--- /dev/null
+++ b/lib/KorAP/Indexer.pm

@@ -0,0 +1,5 @@
+package KorAP::Indexer;
+use strict;
+use warnings;
+our $VERSION = '0.02'; # string keeps trailing zeros (0.10 stays '0.10')
+1;

diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 97a9889..7b68947 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm

@@ -25,10 +25,6 @@
   return $log;
 };
 
-warn('IMPLEMENT AGGRESSIVE TOKENIZATION (trennen mit [-\'\s])');
-warn('In the payload the position of the partial token has to be marked, '.
-       'so the voodoo operator can do its thing');
-
 # Parse tokens of the document
 sub parse {
   my $self = shift;
@@ -103,6 +99,7 @@
       $range->gap($old, $from, $have) unless $old >= $from;
 
       # Add surface term
+      # That's always the first term!
       $mtt->add('s:' . $token);
 
       # Add case insensitive term
@@ -141,6 +138,59 @@
   return $self;
 };
 
+# Add sub token terms ('i^1:' letters, 'i^2:' digits, 'i^3:' punctuation)
+# to all tokens of the stream, based on the aggressive tokenization in
+# tokenize.pl from Carsten Schnober.
+# Returns 1 on success and nothing if no token stream is available.
+sub add_subtokens {
+  my $self = shift;
+  my $mtts = $self->stream or return;
+
+  # Term prefix and run pattern per character class, in indexing order.
+  # Capture group 1 is the run itself; its offsets are read from @- and @+.
+  my @classes = (
+    ['i^1:' => qr/(a+)[^a]/],
+    ['i^2:' => qr/(0+)[^0]/],
+    ['i^3:' => qr/(#)/]
+  );
+
+  foreach my $mtt (@{$mtts->multi_term_tokens}) {
+    my $o_start = $mtt->o_start;
+    my $length  = $mtt->o_end - $o_start;
+
+    # Second term of the token without its two-character term prefix
+    my $surface = substr($mtt->lc_surface, 2);
+
+    # Reduce the surface to its character classes:
+    # letters => 'a', digits => '0', punctuation => '#', '~' => 'A'.
+    # The trailing 'E' terminates the last run.
+    my $shape = $surface;
+    $shape =~ s/[[:alpha:]]/a/g;
+    $shape =~ s/[[:digit:]]/0/g;
+    $shape =~ s/\p{Punct}/#/g;
+    $shape =~ y/~/A/;
+    $shape .= 'E';
+
+    foreach my $class (@classes) {
+      my ($prefix, $run) = @$class;
+      while ($shape =~ /$run/g) {
+        my ($from, $to) = ($-[1], $+[1]);
+
+        # A run covering the whole token adds no information
+        next if $to - $from == $length;
+
+        $mtt->add(
+          term    => $prefix . substr($surface, $from, $to - $from),
+          o_start => $from + $o_start,
+          o_end   => $to + $o_start
+        );
+      };
+    };
+  };
+
+  return 1;
+};
+
 
 # Get span positions through character offsets
 sub range {
@@ -492,6 +542,19 @@
 Start the tokenization process.
 
 
+=head2 add_subtokens
+
+  $tokens->add_subtokens;
+  $tokens->add_subtokens(
+    sub {
+       ...
+    }
+  );
+
+Add sub token information to the index.
+This is based on the C<aggressive> tokenization, written by Carsten Schnober.
+
+
 =head2 add_spandata
 
   $tokens->add_spandata(
commit	f03c680ecc25127bdeea6ecd9bfac68cf02af912	[log] [tgz]
author	Nils Diewald <nils@diewald-online.de>	Mon Jul 21 16:39:44 2014 +0000
committer	Nils Diewald <nils@diewald-online.de>	Mon Jul 21 16:39:44 2014 +0000
tree	66d16134973fca13b6f4c781ce10922895c8f343
parent	ff6d078115bb1f2965fa7962c39e11a22f8d0df3 [diff]