Sentence annotations for all providing foundries and a beginning subtokenization based on cschnobers code
diff --git a/t/artificial-subtoken.t b/t/artificial-subtoken.t
new file mode 100644
index 0000000..ebf3b33
--- /dev/null
+++ b/t/artificial-subtoken.t
@@ -0,0 +1,65 @@
+#!/usr/bin/env perl
+# source ~/perl5/perlbrew/etc/bashrc
+# perlbrew switch perl-blead@korap
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use Benchmark ':hireswallclock';
+use lib 'lib', '../lib';
+use Scalar::Util qw/weaken/;
+
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+
+use_ok('KorAP::Document');
+
+my $path = catdir(dirname(__FILE__), 'artificial');
+ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
+is($doc->path, $path . '/', 'Path');
+ok($doc->parse, 'Parse document');
+
+sub new_tokenizer {
+ my $x = $doc;
+ weaken $x;
+ return KorAP::Tokenizer->new(
+ path => $x->path,
+ doc => $x,
+ foundry => 'OpenNLP',
+ layer => 'Tokens',
+ name => 'tokens'
+ )
+};
+
+is($doc->primary->data,
+ 'Zum letzten kulturellen Anlass lädt die Leitung des Schulheimes Hofbergli ein, '.
+ 'bevor der Betrieb Ende Schuljahr eingestellt wird.', 'Primary data');
+
+is($doc->primary->data_length, 129, 'Primary data length');
+
+is($doc->primary->data(0,3), 'Zum', 'Get primary data');
+
+# Get tokens
+use_ok('KorAP::Tokenizer');
+# Get tokenization
+ok(my $tokens = KorAP::Tokenizer->new(
+ path => $doc->path,
+ doc => $doc,
+ foundry => 'OpenNLP',
+ layer => 'Tokens',
+ name => 'tokens'
+), 'New Tokenizer');
+ok($tokens->parse, 'Parse');
+
+ok($tokens->add_subtokens, 'Add subtokens');
+
+# diag $tokens->to_string;
+
+#foreach (@{$tokens->stream->multi_term_tokens}) {
+# print $_;
+#};
+
+done_testing;
+
+
+__END__
diff --git a/t/artificial.t b/t/artificial.t
index e4e5282..274e4ab 100644
--- a/t/artificial.t
+++ b/t/artificial.t
@@ -103,7 +103,7 @@
# Add OpenNLP/sentences
ok($tokens->add('OpenNLP', 'Sentences'), 'Add OpenNLP/Sentences');
-is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|opennlp/p:APPRART|<>:opennlp/s#0-129$<i>17|-:opennlp/sentences$<i>1]', 'Correct sentence');
+is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|opennlp/p:APPRART|<>:opennlp/s:s#0-129$<i>17|-:opennlp/sentences$<i>1]', 'Correct sentence');
# New instantiation
@@ -124,7 +124,7 @@
ok($tokens->add('Base', 'Paragraphs'), 'Add Base/Paragraphs');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:base/s#0-129$<i>17|<>:base/text#0-129$<i>17|-:base/sentences$<i>1|-:base/paragraphs$<i>0]',
+ '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:base/s:s#0-129$<i>17|<>:base/s:t#0-129$<i>17|-:base/sentences$<i>1|-:base/paragraphs$<i>0]',
'Correct base annotation');
@@ -161,7 +161,7 @@
ok($tokens->add('CoreNLP', 'Sentences'), 'Add CoreNLP/Sentences');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|corenlp/p:APPRART|<>:corenlp/s#0-129$<i>17|-:corenlp/sentences$<i>1]',
+ '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|corenlp/p:APPRART|<>:corenlp/s:s#0-129$<i>17|-:corenlp/sentences$<i>1]',
'Correct corenlp annotation');
@@ -172,7 +172,7 @@
ok($tokens->add('Connexor', 'Sentences'), 'Add Connexor/Sentences');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:cnx/s#0-129$<i>17|-:cnx/sentences$<i>1]',
+ '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:cnx/s:s#0-129$<i>17|-:cnx/sentences$<i>1]',
'Correct cnx annotation');
# New instantiation
@@ -242,7 +242,7 @@
# Add XIP/Sentences
ok($tokens->add('XIP', 'Sentences'), 'Add XIP/Sentences');
-is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:xip/s#0-129$<i>17|-:xip/sentences$<i>1]', 'First sentence');
+is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:xip/s:s#0-129$<i>17|-:xip/sentences$<i>1]', 'First sentence');
# Add XIP/Morpho
ok($tokens->add('XIP', 'Morpho'), 'Add XIP/Morpho');
diff --git a/t/artificial/opennlp/tokens.xml b/t/artificial/opennlp/tokens.xml
index d0bc237..b181a49 100644
--- a/t/artificial/opennlp/tokens.xml
+++ b/t/artificial/opennlp/tokens.xml
@@ -1,6 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?><?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?><layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="ART_00001" VERSION="KorAP-0.4">
<spanList>
- -43
<span id="s_7" from="0" to="3"/>
<span id="s_8" from="4" to="11"/>
<span id="s_9" from="12" to="23"/>
diff --git a/t/transform.t b/t/transform.t
index 3f351d7..093b5a4 100644
--- a/t/transform.t
+++ b/t/transform.t
@@ -105,13 +105,13 @@
# Add sentences
ok($tokens->add('Base', 'Sentences'), 'Add Sentences');
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s#0-74$<i>13|<>:base/text#0-6083$<i>923|-:base/sentences$<i>96]', 'Startinfo');
+is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96]', 'Startinfo');
foreach (@layers) {
ok($tokens->add(@$_), 'Add '. join(', ', @$_));
};
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s#0-74$<i>13|<>:base/text#0-6083$<i>923|-:base/sentences$<i>96|<>:base/para#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s#0-74$<i>13|-:xip/sentences$<i>64]', 'Startinfo');
+is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96|<>:base/s:p#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s:s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s:s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s:s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s:s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s:s#0-74$<i>13|-:xip/sentences$<i>64]', 'Startinfo');
#is($tokens->stream->pos(118)->to_string,