Sentence annotations for all foundries that provide them, and an initial subtokenization based on cschnober's code

commit: f03c680ecc25127bdeea6ecd9bfac68cf02af912 [log] [tgz]
author: Nils Diewald <nils@diewald-online.de> Mon Jul 21 16:39:44 2014 +0000
committer: Nils Diewald <nils@diewald-online.de> Mon Jul 21 16:39:44 2014 +0000
tree: 66d16134973fca13b6f4c781ce10922895c8f343
parent: ff6d078115bb1f2965fa7962c39e11a22f8d0df3 [diff]
diff --git a/Changes b/Changes
index f66c20b..c29d80d 100644
--- a/Changes
+++ b/Changes

@@ -1,3 +1,7 @@
+0.2 2014-07-21
+        - Sentence annotations for all providing foundries
+	- Starting subtokenization
+
 0.1 2014-04-15
 	- [bugfix] for first token annotations
         - Sentences are now available from all foundries that have it

diff --git a/Makefile.PL b/Makefile.PL
index 47450b1..3361ce4 100644
--- a/Makefile.PL
+++ b/Makefile.PL

@@ -5,10 +5,10 @@
 use ExtUtils::MakeMaker;
 
 WriteMakefile(
-  NAME         => 'KorAP::Bundle',
+  NAME         => 'KorAP::Indexer',
   AUTHOR       => 'Nils Diewald',
   ABSTRACT     => 'Perl Implementation for Generating Multifoundry Lucene Indices',
-  VERSION_FROM => 'lib/KorAP/Bundle.pm',
+  VERSION_FROM => 'lib/KorAP/Indexer.pm',
   PREREQ_PM => {
     'Mojolicious'    => 4.51,
     'Packed::Array'  => 0.01,

diff --git a/lib/KorAP/Bundle.pm b/lib/KorAP/Bundle.pm
deleted file mode 100644
index 43997e3..0000000
--- a/lib/KorAP/Bundle.pm
+++ /dev/null

@@ -1,5 +0,0 @@
-package KorAP::Bundle;
-
-our $VERSION = 0.01;
-
-1;

diff --git a/lib/KorAP/Field/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
index 6e69704..6e43493 100644
--- a/lib/KorAP/Field/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm

@@ -24,6 +24,15 @@
   return $mt;
 };
 
+
+# Get the surface term of the token.
+# This is the first entry of the token's term list
+# (the tokenizer adds the surface term before any other term).
+sub surface {
+  my $self = shift;
+  return $self->{mt}->[0]->term;
+};
+
+# Get the second term of the token's term list.
+# NOTE(review): presumably the case insensitive ('i:') surface term
+# that the tokenizer adds right after the surface term - confirm.
+sub lc_surface {
+  my $self = shift;
+  return $self->{mt}->[1]->term;
+};
+
 sub to_string {
   my $self = shift;
   my $string = '[(' . $self->o_start . '-'. $self->o_end . ')';

diff --git a/lib/KorAP/Field/MultiTermTokenStream.pm b/lib/KorAP/Field/MultiTermTokenStream.pm
index ea96a3e..f9e97a2 100644
--- a/lib/KorAP/Field/MultiTermTokenStream.pm
+++ b/lib/KorAP/Field/MultiTermTokenStream.pm

@@ -30,6 +30,10 @@
   return join("\n" , map { $_->to_string } @{$self->{mtt}}) . "\n";
 };
 
+# Get the array reference holding all multi term tokens of the stream
+sub multi_term_tokens {
+  my $self = shift;
+  return $self->{mtt};
+};
+
 sub to_array {
   my $self = shift;
   [ map { $_->to_array } @{$self->{mtt}} ];

diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
index c2670bc..f0b6e6d 100644
--- a/lib/KorAP/Index/Base/Paragraphs.pm
+++ b/lib/KorAP/Index/Base/Paragraphs.pm

@@ -11,7 +11,7 @@
       my ($stream, $span) = @_;
       my $mtt = $stream->pos($span->p_start);
       $mtt->add(
-	term => '<>:base/para',
+	term => '<>:base/s:p',
 	o_start => $span->o_start,
 	o_end => $span->o_end,
 	p_end => $span->p_end

diff --git a/lib/KorAP/Index/Base/Sentences.pm b/lib/KorAP/Index/Base/Sentences.pm
index 8c3c296..a9fcd25 100644
--- a/lib/KorAP/Index/Base/Sentences.pm
+++ b/lib/KorAP/Index/Base/Sentences.pm

@@ -15,7 +15,7 @@
       my $mtt = $stream->pos($span->p_start);
       $first = [$span->p_start, $span->o_start] unless defined $first;
       $mtt->add(
-	term => '<>:base/s',
+	term => '<>:base/s:s',
 	o_start => $span->o_start,
 	o_end => $span->o_end,
 	p_end => $span->p_end
@@ -28,7 +28,7 @@
 
   my $mt = $$self->stream->pos($first->[0]);
   $mt->add(
-    term => '<>:base/text',
+    term => '<>:base/s:t',
     o_start => $first->[1],
     p_end => $last_p,
     o_end => $last_o

diff --git a/lib/KorAP/Index/Connexor/Sentences.pm b/lib/KorAP/Index/Connexor/Sentences.pm
new file mode 100644
index 0000000..04cee09
--- /dev/null
+++ b/lib/KorAP/Index/Connexor/Sentences.pm

@@ -0,0 +1,29 @@
+package KorAP::Index::Connexor::Sentences;
+use KorAP::Index::Base;
+
+# Add the Connexor sentence spans to the token stream.
+# Returns 1 on success and nothing if add_spandata returns a false value.
+sub parse {
+  my $self = shift;
+
+  # Number of spans the callback was called for
+  my $sentences = 0;
+
+  # Callback handed to add_spandata: receives the token stream and a span
+  # and attaches a sentence span term to the token at the span's start
+  my $on_span = sub {
+    my ($stream, $span) = @_;
+    my $token = $stream->pos($span->p_start);
+    $token->add(
+      term    => '<>:cnx/s:s',
+      o_start => $span->o_start,
+      o_end   => $span->o_end,
+      p_end   => $span->p_end
+    );
+    $sentences++;
+  };
+
+  return unless $$self->add_spandata(
+    foundry => 'connexor',
+    layer   => 'sentences',
+    cb      => $on_span
+  );
+
+  # Store the number of sentences as meta information of the stream
+  $$self->stream->add_meta('cnx/sentences', "<i>$sentences");
+
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/CoreNLP/Constituency.pm b/lib/KorAP/Index/CoreNLP/Constituency.pm
new file mode 100644
index 0000000..4793bfd
--- /dev/null
+++ b/lib/KorAP/Index/CoreNLP/Constituency.pm

@@ -0,0 +1,85 @@
+package KorAP::Index::CoreNLP::Constituency;
+use KorAP::Index::Base;
+use Set::Scalar;
+use v5.16;
+
+# Add the CoreNLP constituency spans to the token stream.
+# First all spans are collected and the root spans (spans that are not
+# the target of any 'dominates' relation) are determined. Then each tree
+# is walked from its root and one '<>:corenlp/c:TYPE' span term is added
+# per span, with the depth below the root as payload.
+# Returns 1 on success and nothing if add_spandata returns a false value.
+sub parse {
+  my $self = shift;
+
+  # Collect all spans and check for roots
+  # %corenlp_const maps span ids to spans; the two sets hold all span ids
+  # and the ids of all spans that are dominated by another span
+  my %corenlp_const;
+  my $corenlp_const_root = Set::Scalar->new;
+  my $corenlp_const_noroot = Set::Scalar->new;
+
+  # First run:
+  # Remember each span and mark all its 'dominates' targets as non-roots
+  $$self->add_spandata(
+    foundry => 'corenlp',
+    layer => 'constituency',
+    cb => sub {
+      my ($stream, $span) = @_;
+
+      $corenlp_const{$span->id} = $span;
+      $corenlp_const_root->insert($span->id);
+
+      # A single relation is not wrapped in an array - normalize
+      my $rel = $span->hash->{rel} or return;
+      $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+      foreach (@$rel) {
+	if ($_->{-label} eq 'dominates' && $_->{-target}) {
+	  $corenlp_const_noroot->insert($_->{-target});
+	};
+      };
+    }
+  ) or return;
+
+  my $stream = $$self->stream;
+
+  # Add the term for a span at the given level and recurse into
+  # all spans it dominates with level + 1
+  my $add_const = sub {
+    my $span = shift;
+    my $level = shift;
+    my $mtt = $stream->pos($span->p_start);
+
+    # Only 'const' features with a text value are indexed
+    my $content = $span->hash;
+    my $f = $content->{fs}->{f};
+    return unless $f->{-name} eq 'const';
+
+    my $type = $f->{'#text'} or return;
+
+    # $type is now NPA, NP, NUM ...
+    my %term = (
+      term => '<>:corenlp/c:' . $type,
+      o_start => $span->o_start,
+      o_end => $span->o_end,
+      p_end => $span->p_end
+    );
+
+    # The root level (0) gets no payload
+    $term{payload} = '<b>' . $level if $level;
+
+    $mtt->add(%term);
+
+    # Reference to this anonymous sub for the recursive calls
+    my $this = __SUB__;
+
+    my $rel = $content->{rel} or return;
+    $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+    foreach (@$rel) {
+      next if $_->{-label} ne 'dominates' || !$_->{-target};
+      # Each span is removed once visited, so it is added only once.
+      # NOTE(review): a missing target leaves the sub (return), so the
+      # remaining relations of this span are not visited - confirm that
+      # this is intended rather than 'next'.
+      my $subspan = delete $corenlp_const{$_->{-target}} or return;
+      $this->($subspan, $level + 1);
+    };
+  };
+
+  # Second run: Start at all spans that are never dominated
+  my $diff = $corenlp_const_root->difference($corenlp_const_noroot);
+  foreach ($diff->members) {
+    my $obj = delete $corenlp_const{$_} or next;
+    $add_const->($obj, 0);
+  };
+
+  return 1;
+};
+
+# Get the layer information of this annotation as an array reference
+sub layer_info {
+    return ['corenlp/c=const'];
+}
+
+1;

diff --git a/lib/KorAP/Index/CoreNLP/Sentences.pm b/lib/KorAP/Index/CoreNLP/Sentences.pm
new file mode 100644
index 0000000..1bd84e0
--- /dev/null
+++ b/lib/KorAP/Index/CoreNLP/Sentences.pm

@@ -0,0 +1,29 @@
+package KorAP::Index::CoreNLP::Sentences;
+use KorAP::Index::Base;
+
+# Add the CoreNLP sentence spans to the token stream.
+# Returns 1 on success and nothing if add_spandata returns a false value.
+sub parse {
+  my $self = shift;
+
+  # Number of spans the callback was called for
+  my $sentences = 0;
+
+  # Callback handed to add_spandata: receives the token stream and a span
+  # and attaches a sentence span term to the token at the span's start
+  my $on_span = sub {
+    my ($stream, $span) = @_;
+    my $token = $stream->pos($span->p_start);
+    $token->add(
+      term    => '<>:corenlp/s:s',
+      o_start => $span->o_start,
+      o_end   => $span->o_end,
+      p_end   => $span->p_end
+    );
+    $sentences++;
+  };
+
+  return unless $$self->add_spandata(
+    foundry => 'corenlp',
+    layer   => 'sentences',
+    cb      => $on_span
+  );
+
+  # Store the number of sentences as meta information of the stream
+  $$self->stream->add_meta('corenlp/sentences', "<i>$sentences");
+
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/OpenNLP/Morpho.pm b/lib/KorAP/Index/OpenNLP/Morpho.pm
index 2de5042..7ebdd96 100644
--- a/lib/KorAP/Index/OpenNLP/Morpho.pm
+++ b/lib/KorAP/Index/OpenNLP/Morpho.pm

@@ -20,7 +20,7 @@
       if (($content->{-name} eq 'pos') && ($content->{'#text'})) {
 	$mtt->add(
 	  term => 'opennlp/p:' . $content->{'#text'}
-	);
+	) if $content->{'#text'};
       };
     }) or return;
 

diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
index 1ec1b60..fd0c9d3 100644
--- a/lib/KorAP/Index/OpenNLP/Sentences.pm
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm

@@ -12,7 +12,7 @@
       my ($stream, $span) = @_;
       my $mtt = $stream->pos($span->p_start);
       $mtt->add(
-	term => '<>:opennlp/s',
+	term => '<>:opennlp/s:s',
 	o_start => $span->o_start,
 	o_end => $span->o_end,
 	p_end => $span->p_end

diff --git a/lib/KorAP/Index/TreeTagger/Sentences.pm b/lib/KorAP/Index/TreeTagger/Sentences.pm
new file mode 100644
index 0000000..d96d96e
--- /dev/null
+++ b/lib/KorAP/Index/TreeTagger/Sentences.pm

@@ -0,0 +1,29 @@
+package KorAP::Index::TreeTagger::Sentences;
+use KorAP::Index::Base;
+
+# Add the TreeTagger sentence spans to the token stream.
+# Returns 1 on success and nothing if add_spandata returns a false value.
+sub parse {
+  my $self = shift;
+
+  # Number of spans the callback was called for
+  my $sentences = 0;
+
+  # Callback handed to add_spandata: receives the token stream and a span
+  # and attaches a sentence span term to the token at the span's start
+  my $on_span = sub {
+    my ($stream, $span) = @_;
+    my $token = $stream->pos($span->p_start);
+    $token->add(
+      term    => '<>:tt/s:s',
+      o_start => $span->o_start,
+      o_end   => $span->o_end,
+      p_end   => $span->p_end
+    );
+    $sentences++;
+  };
+
+  return unless $$self->add_spandata(
+    foundry => 'tree_tagger',
+    layer   => 'sentences',
+    cb      => $on_span
+  );
+
+  # Store the number of sentences as meta information of the stream
+  $$self->stream->add_meta('tt/sentences', "<i>$sentences");
+
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/XIP/Constituency.pm b/lib/KorAP/Index/XIP/Constituency.pm
index a5edd28..f1a0615 100644
--- a/lib/KorAP/Index/XIP/Constituency.pm
+++ b/lib/KorAP/Index/XIP/Constituency.pm

@@ -98,7 +98,7 @@
     my $rel = $content->{rel};
 
     unless ($rel) {
-      warn $f->{-id} . ' has no relation';
+      warn $f->{-id} . ' has no relation' if $f->{-id};
       return;
     };
 
@@ -116,10 +116,9 @@
       next unless $target;
 
       my $subspan = delete $xip_const{$target};
-      unless ($subspan) {
-#	warn "Span " . $target . " not found";
-	return;
-      };
+      return unless $subspan;
+      #	warn "Span " . $target . " not found";
+
       $this->($subspan, $level + 1);
     };
   };

diff --git a/lib/KorAP/Index/XIP/Sentences.pm b/lib/KorAP/Index/XIP/Sentences.pm
new file mode 100644
index 0000000..f045152
--- /dev/null
+++ b/lib/KorAP/Index/XIP/Sentences.pm

@@ -0,0 +1,32 @@
+package KorAP::Index::XIP::Sentences;
+use KorAP::Index::Base;
+
+# Add the XIP sentence spans to the token stream.
+# Returns 1 on success and nothing if add_spandata returns a false value.
+sub parse {
+  my $self = shift;
+
+  # Number of spans the callback was called for
+  my $sentences = 0;
+
+  # Callback handed to add_spandata: receives the token stream and a span
+  # and attaches a sentence span term to the token at the span's start
+  my $on_span = sub {
+    my ($stream, $span) = @_;
+    my $token = $stream->pos($span->p_start);
+    $token->add(
+      term    => '<>:xip/s:s',
+      o_start => $span->o_start,
+      o_end   => $span->o_end,
+      p_end   => $span->p_end
+    );
+    $sentences++;
+  };
+
+  # Unlike the other sentence layers, an 'xip' encoding is requested
+  return unless $$self->add_spandata(
+    foundry  => 'xip',
+    layer    => 'sentences',
+    encoding => 'xip',
+    cb       => $on_span
+  );
+
+  # Store the number of sentences as meta information of the stream
+  $$self->stream->add_meta('xip/sentences', "<i>$sentences");
+
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Indexer.pm b/lib/KorAP/Indexer.pm
new file mode 100644
index 0000000..ba65c5b
--- /dev/null
+++ b/lib/KorAP/Indexer.pm

@@ -0,0 +1,5 @@
+# Main package of the distribution; it only carries the version number
+package KorAP::Indexer;
+
+# Distribution version (Makefile.PL reads it from this file via VERSION_FROM)
+our $VERSION = 0.02;
+
+1;

diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 97a9889..7b68947 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm

@@ -25,10 +25,6 @@
   return $log;
 };
 
-warn('IMPLEMENT AGGRESSIVE TOKENIZATION (trennen mit [-\'\s])');
-warn('In the payload the position of the partial token has to be marked, '.
-       'so the voodoo operator can do its thing');
-
 # Parse tokens of the document
 sub parse {
   my $self = shift;
@@ -103,6 +99,7 @@
       $range->gap($old, $from, $have) unless $old >= $from;
 
       # Add surface term
+      # That's always the first term!
       $mtt->add('s:' . $token);
 
       # Add case insensitive term
@@ -141,6 +138,59 @@
   return $self;
 };
 
+# Add subtoken terms to all tokens of the stream.
+#
+# The second term of each token (see lc_surface) is stripped of its
+# two character prefix and reduced to a shape string: letters become 'a',
+# digits become '0', punctuation becomes '#' and '~' becomes 'A'.
+# Each run of letters is added as an 'i^1:' term, each run of digits as
+# an 'i^2:' term and each single punctuation character as an 'i^3:' term.
+# The offsets of the new terms are the positions of the run in the surface,
+# shifted by the start offset of the token. Runs that are as long as the
+# whole token are not added.
+#
+# Returns 1 on success and nothing if there is no stream.
+sub add_subtokens {
+  my $self = shift;
+  my $mtts = $self->stream or return;
+
+  # Term prefix and shape pattern for each kind of subtoken.
+  # The first capture group is the run to be added.
+  my @kinds = (
+    ['i^1:', qr/(a+)[^a]/],
+    ['i^2:', qr/(0+)[^0]/],
+    ['i^3:', qr/(#)/]
+  );
+
+  foreach my $mtt (@{$mtts->multi_term_tokens}) {
+    my $o_start = $mtt->o_start;
+    my $length = $mtt->o_end - $o_start;
+
+    # Surface without the two character term prefix
+    my $surface = substr($mtt->lc_surface, 2);
+
+    # Algorithm based on aggressive tokenization in
+    # tokenize.pl from Carsten Schnober
+    my $shape = $surface;
+    $shape =~ s/[[:alpha:]]/a/g;
+    $shape =~ s/[[:digit:]]/0/g;
+    $shape =~ s/\p{Punct}/#/g;
+    $shape =~ y/~/A/;
+
+    # End marker, so a run at the end of the token is followed
+    # by a character that is not part of the run
+    $shape .= 'E';
+
+    foreach my $kind (@kinds) {
+      my ($prefix, $pattern) = @$kind;
+
+      while ($shape =~ /$pattern/g) {
+
+	# Start and end position of the run in the shape string,
+	# which has the same character positions as the surface
+	my ($from, $to) = ($-[1], $+[1]);
+
+	# The run is the whole token - nothing to add
+	next if $to - $from == $length;
+
+	# The third parameter of substr is a length, not an end position
+	$mtt->add(
+	  term => $prefix . substr($surface, $from, $to - $from),
+	  o_start => $from + $o_start,
+	  o_end => $to + $o_start
+	);
+      };
+    };
+  };
+
+  return 1;
+};
+
 
 # Get span positions through character offsets
 sub range {
@@ -492,6 +542,19 @@
 Start the tokenization process.
 
 
+=head2 add_subtokens
+
+  $tokens->add_subtokens;
+
+Add sub token information to the index.
+This is based on the C<aggressive> tokenization, written by Carsten Schnober.
+
+
 =head2 add_spandata
 
   $tokens->add_spandata(

diff --git a/t/artificial-subtoken.t b/t/artificial-subtoken.t
new file mode 100644
index 0000000..ebf3b33
--- /dev/null
+++ b/t/artificial-subtoken.t

@@ -0,0 +1,65 @@
+#!/usr/bin/env perl
+# source ~/perl5/perlbrew/etc/bashrc
+# perlbrew switch perl-blead@korap
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use Benchmark ':hireswallclock';
+use lib 'lib', '../lib';
+use Scalar::Util qw/weaken/;
+
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+
+use_ok('KorAP::Document');
+
+my $path = catdir(dirname(__FILE__), 'artificial');
+ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
+is($doc->path, $path . '/', 'Path');
+ok($doc->parse, 'Parse document');
+
+# Create a new tokenizer for the OpenNLP token layer of the document.
+# The document is handed over using a weakened copy of its reference.
+sub new_tokenizer {
+  my $weak_doc = $doc;
+  weaken $weak_doc;
+
+  return KorAP::Tokenizer->new(
+    path    => $weak_doc->path,
+    doc     => $weak_doc,
+    foundry => 'OpenNLP',
+    layer   => 'Tokens',
+    name    => 'tokens'
+  );
+};
+
+is($doc->primary->data,
+   'Zum letzten kulturellen Anlass lädt die Leitung des Schulheimes Hofbergli ein, '.
+     'bevor der Betrieb Ende Schuljahr eingestellt wird.', 'Primary data');
+
+is($doc->primary->data_length, 129, 'Primary data length');
+
+is($doc->primary->data(0,3), 'Zum', 'Get primary data');
+
+# Load the tokenizer class
+use_ok('KorAP::Tokenizer');
+# Create a tokenizer for the OpenNLP token layer of the document
+ok(my $tokens = KorAP::Tokenizer->new(
+  path => $doc->path,
+  doc => $doc,
+  foundry => 'OpenNLP',
+  layer => 'Tokens',
+  name => 'tokens'
+), 'New Tokenizer');
+# Parse the tokens of the document
+ok($tokens->parse, 'Parse');
+
+# This only checks that add_subtokens returns a true value;
+# the subtoken terms added to the stream are not inspected
+ok($tokens->add_subtokens, 'Add subtokens');
+
+# diag $tokens->to_string;
+
+#foreach (@{$tokens->stream->multi_term_tokens}) {
+#  print $_;
+#};
+
+done_testing;
+
+
+__END__

diff --git a/t/artificial.t b/t/artificial.t
index e4e5282..274e4ab 100644
--- a/t/artificial.t
+++ b/t/artificial.t

@@ -103,7 +103,7 @@
 # Add OpenNLP/sentences
 ok($tokens->add('OpenNLP', 'Sentences'), 'Add OpenNLP/Sentences');
 
-is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|opennlp/p:APPRART|<>:opennlp/s#0-129$<i>17|-:opennlp/sentences$<i>1]', 'Correct sentence');
+is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|opennlp/p:APPRART|<>:opennlp/s:s#0-129$<i>17|-:opennlp/sentences$<i>1]', 'Correct sentence');
 
 
 # New instantiation
@@ -124,7 +124,7 @@
 ok($tokens->add('Base', 'Paragraphs'), 'Add Base/Paragraphs');
 
 is($tokens->stream->pos(0)->to_string,
-   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:base/s#0-129$<i>17|<>:base/text#0-129$<i>17|-:base/sentences$<i>1|-:base/paragraphs$<i>0]',
+   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:base/s:s#0-129$<i>17|<>:base/s:t#0-129$<i>17|-:base/sentences$<i>1|-:base/paragraphs$<i>0]',
    'Correct base annotation');
 
 
@@ -161,7 +161,7 @@
 ok($tokens->add('CoreNLP', 'Sentences'), 'Add CoreNLP/Sentences');
 
 is($tokens->stream->pos(0)->to_string,
-   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|corenlp/p:APPRART|<>:corenlp/s#0-129$<i>17|-:corenlp/sentences$<i>1]',
+   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|corenlp/p:APPRART|<>:corenlp/s:s#0-129$<i>17|-:corenlp/sentences$<i>1]',
    'Correct corenlp annotation');
 
 
@@ -172,7 +172,7 @@
 ok($tokens->add('Connexor', 'Sentences'), 'Add Connexor/Sentences');
 
 is($tokens->stream->pos(0)->to_string,
-   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:cnx/s#0-129$<i>17|-:cnx/sentences$<i>1]',
+   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:cnx/s:s#0-129$<i>17|-:cnx/sentences$<i>1]',
    'Correct cnx annotation');
 
 # New instantiation
@@ -242,7 +242,7 @@
 # Add XIP/Sentences
 ok($tokens->add('XIP', 'Sentences'), 'Add XIP/Sentences');
 
-is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:xip/s#0-129$<i>17|-:xip/sentences$<i>1]', 'First sentence');
+is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:xip/s:s#0-129$<i>17|-:xip/sentences$<i>1]', 'First sentence');
 
 # Add XIP/Morpho
 ok($tokens->add('XIP', 'Morpho'), 'Add XIP/Morpho');

diff --git a/t/artificial/opennlp/tokens.xml b/t/artificial/opennlp/tokens.xml
index d0bc237..b181a49 100644
--- a/t/artificial/opennlp/tokens.xml
+++ b/t/artificial/opennlp/tokens.xml

@@ -1,6 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?><?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?><layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="ART_00001" VERSION="KorAP-0.4">
 <spanList>
-  -43
       <span id="s_7" from="0" to="3"/>
       <span id="s_8" from="4" to="11"/>
       <span id="s_9" from="12" to="23"/>

diff --git a/t/transform.t b/t/transform.t
index 3f351d7..093b5a4 100644
--- a/t/transform.t
+++ b/t/transform.t

@@ -105,13 +105,13 @@
 # Add sentences
 ok($tokens->add('Base', 'Sentences'), 'Add Sentences');
 
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s#0-74$<i>13|<>:base/text#0-6083$<i>923|-:base/sentences$<i>96]', 'Startinfo');
+is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96]', 'Startinfo');
 
 foreach (@layers) {
   ok($tokens->add(@$_), 'Add '. join(', ', @$_));
 };
 
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s#0-74$<i>13|<>:base/text#0-6083$<i>923|-:base/sentences$<i>96|<>:base/para#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s#0-74$<i>13|-:xip/sentences$<i>64]', 'Startinfo');
+is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96|<>:base/s:p#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s:s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s:s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s:s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s:s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s:s#0-74$<i>13|-:xip/sentences$<i>64]', 'Startinfo');
 
 
 #is($tokens->stream->pos(118)->to_string,
commit	f03c680ecc25127bdeea6ecd9bfac68cf02af912	[log] [tgz]
author	Nils Diewald <nils@diewald-online.de>	Mon Jul 21 16:39:44 2014 +0000
committer	Nils Diewald <nils@diewald-online.de>	Mon Jul 21 16:39:44 2014 +0000
tree	66d16134973fca13b6f4c781ce10922895c8f343
parent	ff6d078115bb1f2965fa7962c39e11a22f8d0df3 [diff]