Sentence annotations for all providing foundries and an initial subtokenization based on cschnober's code
diff --git a/Changes b/Changes
index f66c20b..c29d80d 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.2 2014-07-21
+ - Sentence annotations for all providing foundries
+ - Starting subtokenization
+
0.1 2014-04-15
- [bugfix] for first token annotations
- Sentences are now available from all foundries that have it
diff --git a/Makefile.PL b/Makefile.PL
index 47450b1..3361ce4 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -5,10 +5,10 @@
use ExtUtils::MakeMaker;
WriteMakefile(
- NAME => 'KorAP::Bundle',
+ NAME => 'KorAP::Indexer',
AUTHOR => 'Nils Diewald',
ABSTRACT => 'Perl Implementation for Generating Multifoundry Lucene Indices',
- VERSION_FROM => 'lib/KorAP/Bundle.pm',
+ VERSION_FROM => 'lib/KorAP/Indexer.pm',
PREREQ_PM => {
'Mojolicious' => 4.51,
'Packed::Array' => 0.01,
diff --git a/lib/KorAP/Bundle.pm b/lib/KorAP/Bundle.pm
deleted file mode 100644
index 43997e3..0000000
--- a/lib/KorAP/Bundle.pm
+++ /dev/null
@@ -1,5 +0,0 @@
-package KorAP::Bundle;
-
-our $VERSION = 0.01;
-
-1;
diff --git a/lib/KorAP/Field/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
index 6e69704..6e43493 100644
--- a/lib/KorAP/Field/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm
@@ -24,6 +24,15 @@
return $mt;
};
+
+sub surface {
+ $_[0]->{mt}->[0]->term;
+};
+
+sub lc_surface {
+ $_[0]->{mt}->[1]->term;
+};
+
sub to_string {
my $self = shift;
my $string = '[(' . $self->o_start . '-'. $self->o_end . ')';
diff --git a/lib/KorAP/Field/MultiTermTokenStream.pm b/lib/KorAP/Field/MultiTermTokenStream.pm
index ea96a3e..f9e97a2 100644
--- a/lib/KorAP/Field/MultiTermTokenStream.pm
+++ b/lib/KorAP/Field/MultiTermTokenStream.pm
@@ -30,6 +30,10 @@
return join("\n" , map { $_->to_string } @{$self->{mtt}}) . "\n";
};
+sub multi_term_tokens {
+ $_[0]->{mtt};
+};
+
sub to_array {
my $self = shift;
[ map { $_->to_array } @{$self->{mtt}} ];
diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
index c2670bc..f0b6e6d 100644
--- a/lib/KorAP/Index/Base/Paragraphs.pm
+++ b/lib/KorAP/Index/Base/Paragraphs.pm
@@ -11,7 +11,7 @@
my ($stream, $span) = @_;
my $mtt = $stream->pos($span->p_start);
$mtt->add(
- term => '<>:base/para',
+ term => '<>:base/s:p',
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end
diff --git a/lib/KorAP/Index/Base/Sentences.pm b/lib/KorAP/Index/Base/Sentences.pm
index 8c3c296..a9fcd25 100644
--- a/lib/KorAP/Index/Base/Sentences.pm
+++ b/lib/KorAP/Index/Base/Sentences.pm
@@ -15,7 +15,7 @@
my $mtt = $stream->pos($span->p_start);
$first = [$span->p_start, $span->o_start] unless defined $first;
$mtt->add(
- term => '<>:base/s',
+ term => '<>:base/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end
@@ -28,7 +28,7 @@
my $mt = $$self->stream->pos($first->[0]);
$mt->add(
- term => '<>:base/text',
+ term => '<>:base/s:t',
o_start => $first->[1],
p_end => $last_p,
o_end => $last_o
diff --git a/lib/KorAP/Index/Connexor/Sentences.pm b/lib/KorAP/Index/Connexor/Sentences.pm
new file mode 100644
index 0000000..04cee09
--- /dev/null
+++ b/lib/KorAP/Index/Connexor/Sentences.pm
@@ -0,0 +1,29 @@
+package KorAP::Index::Connexor::Sentences;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+ my $i = 0;
+
+ $$self->add_spandata(
+ foundry => 'connexor',
+ layer => 'sentences',
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:cnx/s:s',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ $i++;
+ }
+ ) or return;
+
+ $$self->stream->add_meta('cnx/sentences', '<i>' . $i);
+
+ return 1;
+};
+
+1;
diff --git a/lib/KorAP/Index/CoreNLP/Constituency.pm b/lib/KorAP/Index/CoreNLP/Constituency.pm
new file mode 100644
index 0000000..4793bfd
--- /dev/null
+++ b/lib/KorAP/Index/CoreNLP/Constituency.pm
@@ -0,0 +1,85 @@
+package KorAP::Index::CoreNLP::Constituency;
+use KorAP::Index::Base;
+use Set::Scalar;
+use v5.16;
+
+sub parse {
+ my $self = shift;
+
+ # Collect all spans and check for roots
+ my %corenlp_const;
+ my $corenlp_const_root = Set::Scalar->new;
+ my $corenlp_const_noroot = Set::Scalar->new;
+
+ # First run:
+ $$self->add_spandata(
+ foundry => 'corenlp',
+ layer => 'constituency',
+ cb => sub {
+ my ($stream, $span) = @_;
+
+ $corenlp_const{$span->id} = $span;
+ $corenlp_const_root->insert($span->id);
+
+ my $rel = $span->hash->{rel} or return;
+ $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+ foreach (@$rel) {
+ if ($_->{-label} eq 'dominates' && $_->{-target}) {
+ $corenlp_const_noroot->insert($_->{-target});
+ };
+ };
+ }
+ ) or return;
+
+ my $stream = $$self->stream;
+
+ my $add_const = sub {
+ my $span = shift;
+ my $level = shift;
+ my $mtt = $stream->pos($span->p_start);
+
+ my $content = $span->hash;
+ my $f = $content->{fs}->{f};
+ return unless $f->{-name} eq 'const';
+
+ my $type = $f->{'#text'} or return;
+
+ # $type is now NPA, NP, NUM ...
+ my %term = (
+ term => '<>:corenlp/c:' . $type,
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+
+ $term{payload} = '<b>' . $level if $level;
+
+ $mtt->add(%term);
+
+ my $this = __SUB__;
+
+ my $rel = $content->{rel} or return;
+ $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+ foreach (@$rel) {
+ next if $_->{-label} ne 'dominates' || !$_->{-target};
+ my $subspan = delete $corenlp_const{$_->{-target}} or return;
+ $this->($subspan, $level + 1);
+ };
+ };
+
+ my $diff = $corenlp_const_root->difference($corenlp_const_noroot);
+ foreach ($diff->members) {
+ my $obj = delete $corenlp_const{$_} or next;
+ $add_const->($obj, 0);
+ };
+
+ return 1;
+};
+
+sub layer_info {
+ ['corenlp/c=const']
+}
+
+1;
diff --git a/lib/KorAP/Index/CoreNLP/Sentences.pm b/lib/KorAP/Index/CoreNLP/Sentences.pm
new file mode 100644
index 0000000..1bd84e0
--- /dev/null
+++ b/lib/KorAP/Index/CoreNLP/Sentences.pm
@@ -0,0 +1,29 @@
+package KorAP::Index::CoreNLP::Sentences;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+ my $i = 0;
+
+ $$self->add_spandata(
+ foundry => 'corenlp',
+ layer => 'sentences',
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:corenlp/s:s',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ $i++;
+ }
+ ) or return;
+
+ $$self->stream->add_meta('corenlp/sentences', '<i>' . $i);
+
+ return 1;
+};
+
+1;
diff --git a/lib/KorAP/Index/OpenNLP/Morpho.pm b/lib/KorAP/Index/OpenNLP/Morpho.pm
index 2de5042..7ebdd96 100644
--- a/lib/KorAP/Index/OpenNLP/Morpho.pm
+++ b/lib/KorAP/Index/OpenNLP/Morpho.pm
@@ -20,7 +20,7 @@
if (($content->{-name} eq 'pos') && ($content->{'#text'})) {
$mtt->add(
term => 'opennlp/p:' . $content->{'#text'}
- );
+ ) if $content->{'#text'};
};
}) or return;
diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
index 1ec1b60..fd0c9d3 100644
--- a/lib/KorAP/Index/OpenNLP/Sentences.pm
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm
@@ -12,7 +12,7 @@
my ($stream, $span) = @_;
my $mtt = $stream->pos($span->p_start);
$mtt->add(
- term => '<>:opennlp/s',
+ term => '<>:opennlp/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end
diff --git a/lib/KorAP/Index/TreeTagger/Sentences.pm b/lib/KorAP/Index/TreeTagger/Sentences.pm
new file mode 100644
index 0000000..d96d96e
--- /dev/null
+++ b/lib/KorAP/Index/TreeTagger/Sentences.pm
@@ -0,0 +1,29 @@
+package KorAP::Index::TreeTagger::Sentences;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+ my $i = 0;
+
+ $$self->add_spandata(
+ foundry => 'tree_tagger',
+ layer => 'sentences',
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:tt/s:s',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ $i++;
+ }
+ ) or return;
+
+ $$self->stream->add_meta('tt/sentences', '<i>' . $i);
+
+ return 1;
+};
+
+1;
diff --git a/lib/KorAP/Index/XIP/Constituency.pm b/lib/KorAP/Index/XIP/Constituency.pm
index a5edd28..f1a0615 100644
--- a/lib/KorAP/Index/XIP/Constituency.pm
+++ b/lib/KorAP/Index/XIP/Constituency.pm
@@ -98,7 +98,7 @@
my $rel = $content->{rel};
unless ($rel) {
- warn $f->{-id} . ' has no relation';
+ warn $f->{-id} . ' has no relation' if $f->{-id};
return;
};
@@ -116,10 +116,9 @@
next unless $target;
my $subspan = delete $xip_const{$target};
- unless ($subspan) {
-# warn "Span " . $target . " not found";
- return;
- };
+ return unless $subspan;
+ # warn "Span " . $target . " not found";
+
$this->($subspan, $level + 1);
};
};
diff --git a/lib/KorAP/Index/XIP/Sentences.pm b/lib/KorAP/Index/XIP/Sentences.pm
new file mode 100644
index 0000000..f045152
--- /dev/null
+++ b/lib/KorAP/Index/XIP/Sentences.pm
@@ -0,0 +1,32 @@
+package KorAP::Index::XIP::Sentences;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+
+ my $i = 0;
+
+ $$self->add_spandata(
+ foundry => 'xip',
+ layer => 'sentences',
+ encoding => 'xip',
+ cb => sub {
+ my ($stream, $span) = @_;
+
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:xip/s:s',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ $i++;
+ }
+ ) or return;
+
+ $$self->stream->add_meta('xip/sentences', '<i>' . $i);
+
+ return 1;
+};
+
+1;
diff --git a/lib/KorAP/Indexer.pm b/lib/KorAP/Indexer.pm
new file mode 100644
index 0000000..ba65c5b
--- /dev/null
+++ b/lib/KorAP/Indexer.pm
@@ -0,0 +1,5 @@
+package KorAP::Indexer;
+
+our $VERSION = 0.02;
+
+1;
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 97a9889..7b68947 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -25,10 +25,6 @@
return $log;
};
-warn('IMPLEMENT AGGRESSIVE TOKENIZATION (trennen mit [-\'\s])');
-warn('In the payload the position of the partial token has to be marked, '.
- 'so the voodoo operator can do its thing');
-
# Parse tokens of the document
sub parse {
my $self = shift;
@@ -103,6 +99,7 @@
$range->gap($old, $from, $have) unless $old >= $from;
# Add surface term
+ # That's always the first term!
$mtt->add('s:' . $token);
# Add case insensitive term
@@ -141,6 +138,59 @@
return $self;
};
+sub add_subtokens {
+  # Add sub token terms to every token of the stream, based on the
+  # lowercased surface form: 'i^1:' for runs of letters, 'i^2:' for
+  # runs of digits and 'i^3:' for single punctuation characters.
+  # Returns 1 on success and nothing if no stream is available.
+  my $self = shift;
+  my $mtts = $self->stream or return;
+
+  # Patterns on the masked string and the term prefix of each class
+  my @classes = (
+    [qr/(a+)[^a]/ => 'i^1:'],
+    [qr/(0+)[^0]/ => 'i^2:'],
+    [qr/(#)/ => 'i^3:']
+  );
+
+  foreach my $mtt (@{$mtts->multi_term_tokens}) {
+    my $o_start = $mtt->o_start;
+    my $o_end = $mtt->o_end;
+    my $l = $o_end - $o_start;
+    # Strip the 'i:' prefix from the case insensitive term
+    my $s = substr($mtt->lc_surface, 2);
+    my $os = $s;
+
+    # Algorithm based on aggressive tokenization in
+    # tokenize.pl from Carsten Schnober
+    $s =~ s/[[:alpha:]]/a/g;
+    $s =~ s/[[:digit:]]/0/g;
+    $s =~ s/\p{Punct}/#/g;
+    $s =~ y/~/A/;
+    # Sentinel, so runs at the end of the token are matched as well
+    $s .= 'E';
+
+    foreach my $class (@classes) {
+      my ($re, $prefix) = @$class;
+      while ($s =~ /$re/g) {
+        my $from = $-[1];
+        my $to = $+[1];
+        # A run spanning the whole token is no sub token
+        next if $to - $from == $l;
+
+        # The third parameter of substr is a length, not an end offset
+        $mtt->add(
+          term => $prefix . substr($os, $from, $to - $from),
+          o_start => $from + $o_start,
+          o_end => $to + $o_start
+        );
+      };
+    };
+  };
+
+  return 1;
+};
+
# Get span positions through character offsets
sub range {
@@ -492,6 +542,19 @@
Start the tokenization process.
+=head2 add_subtokens
+
+ $tokens->add_subtokens;
+ $tokens->add_subtokens(
+ sub {
+ ...
+ }
+ );
+
+Add sub token information to the index.
+This is based on the C<aggressive> tokenization, written by Carsten Schnober.
+
+
=head2 add_spandata
$tokens->add_spandata(
diff --git a/t/artificial-subtoken.t b/t/artificial-subtoken.t
new file mode 100644
index 0000000..ebf3b33
--- /dev/null
+++ b/t/artificial-subtoken.t
@@ -0,0 +1,65 @@
+#!/usr/bin/env perl
+# source ~/perl5/perlbrew/etc/bashrc
+# perlbrew switch perl-blead@korap
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use Benchmark ':hireswallclock';
+use lib 'lib', '../lib';
+use Scalar::Util qw/weaken/;
+
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+
+use_ok('KorAP::Document');
+
+my $path = catdir(dirname(__FILE__), 'artificial');
+ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
+is($doc->path, $path . '/', 'Path');
+ok($doc->parse, 'Parse document');
+
+sub new_tokenizer {
+ my $x = $doc;
+ weaken $x;
+ return KorAP::Tokenizer->new(
+ path => $x->path,
+ doc => $x,
+ foundry => 'OpenNLP',
+ layer => 'Tokens',
+ name => 'tokens'
+ )
+};
+
+is($doc->primary->data,
+ 'Zum letzten kulturellen Anlass lädt die Leitung des Schulheimes Hofbergli ein, '.
+ 'bevor der Betrieb Ende Schuljahr eingestellt wird.', 'Primary data');
+
+is($doc->primary->data_length, 129, 'Primary data length');
+
+is($doc->primary->data(0,3), 'Zum', 'Get primary data');
+
+# Get tokens
+use_ok('KorAP::Tokenizer');
+# Get tokenization
+ok(my $tokens = KorAP::Tokenizer->new(
+ path => $doc->path,
+ doc => $doc,
+ foundry => 'OpenNLP',
+ layer => 'Tokens',
+ name => 'tokens'
+), 'New Tokenizer');
+ok($tokens->parse, 'Parse');
+
+ok($tokens->add_subtokens, 'Add subtokens');
+
+# diag $tokens->to_string;
+
+#foreach (@{$tokens->stream->multi_term_tokens}) {
+# print $_;
+#};
+
+done_testing;
+
+
+__END__
diff --git a/t/artificial.t b/t/artificial.t
index e4e5282..274e4ab 100644
--- a/t/artificial.t
+++ b/t/artificial.t
@@ -103,7 +103,7 @@
# Add OpenNLP/sentences
ok($tokens->add('OpenNLP', 'Sentences'), 'Add OpenNLP/Sentences');
-is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|opennlp/p:APPRART|<>:opennlp/s#0-129$<i>17|-:opennlp/sentences$<i>1]', 'Correct sentence');
+is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|opennlp/p:APPRART|<>:opennlp/s:s#0-129$<i>17|-:opennlp/sentences$<i>1]', 'Correct sentence');
# New instantiation
@@ -124,7 +124,7 @@
ok($tokens->add('Base', 'Paragraphs'), 'Add Base/Paragraphs');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:base/s#0-129$<i>17|<>:base/text#0-129$<i>17|-:base/sentences$<i>1|-:base/paragraphs$<i>0]',
+ '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:base/s:s#0-129$<i>17|<>:base/s:t#0-129$<i>17|-:base/sentences$<i>1|-:base/paragraphs$<i>0]',
'Correct base annotation');
@@ -161,7 +161,7 @@
ok($tokens->add('CoreNLP', 'Sentences'), 'Add CoreNLP/Sentences');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|corenlp/p:APPRART|<>:corenlp/s#0-129$<i>17|-:corenlp/sentences$<i>1]',
+ '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|corenlp/p:APPRART|<>:corenlp/s:s#0-129$<i>17|-:corenlp/sentences$<i>1]',
'Correct corenlp annotation');
@@ -172,7 +172,7 @@
ok($tokens->add('Connexor', 'Sentences'), 'Add Connexor/Sentences');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:cnx/s#0-129$<i>17|-:cnx/sentences$<i>1]',
+ '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:cnx/s:s#0-129$<i>17|-:cnx/sentences$<i>1]',
'Correct cnx annotation');
# New instantiation
@@ -242,7 +242,7 @@
# Add XIP/Sentences
ok($tokens->add('XIP', 'Sentences'), 'Add XIP/Sentences');
-is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:xip/s#0-129$<i>17|-:xip/sentences$<i>1]', 'First sentence');
+is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:xip/s:s#0-129$<i>17|-:xip/sentences$<i>1]', 'First sentence');
# Add XIP/Morpho
ok($tokens->add('XIP', 'Morpho'), 'Add XIP/Morpho');
diff --git a/t/artificial/opennlp/tokens.xml b/t/artificial/opennlp/tokens.xml
index d0bc237..b181a49 100644
--- a/t/artificial/opennlp/tokens.xml
+++ b/t/artificial/opennlp/tokens.xml
@@ -1,6 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?><?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?><layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="ART_00001" VERSION="KorAP-0.4">
<spanList>
- -43
<span id="s_7" from="0" to="3"/>
<span id="s_8" from="4" to="11"/>
<span id="s_9" from="12" to="23"/>
diff --git a/t/transform.t b/t/transform.t
index 3f351d7..093b5a4 100644
--- a/t/transform.t
+++ b/t/transform.t
@@ -105,13 +105,13 @@
# Add sentences
ok($tokens->add('Base', 'Sentences'), 'Add Sentences');
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s#0-74$<i>13|<>:base/text#0-6083$<i>923|-:base/sentences$<i>96]', 'Startinfo');
+is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96]', 'Startinfo');
foreach (@layers) {
ok($tokens->add(@$_), 'Add '. join(', ', @$_));
};
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s#0-74$<i>13|<>:base/text#0-6083$<i>923|-:base/sentences$<i>96|<>:base/para#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s#0-74$<i>13|-:xip/sentences$<i>64]', 'Startinfo');
+is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96|<>:base/s:p#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s:s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s:s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s:s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s:s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s:s#0-74$<i>13|-:xip/sentences$<i>64]', 'Startinfo');
#is($tokens->stream->pos(118)->to_string,