Sentence annotations for all foundries that provide them, and an initial subtokenization based on cschnober's code

commit: f03c680ecc25127bdeea6ecd9bfac68cf02af912 [log] [tgz]
author: Nils Diewald <nils@diewald-online.de> Mon Jul 21 16:39:44 2014 +0000
committer: Nils Diewald <nils@diewald-online.de> Mon Jul 21 16:39:44 2014 +0000
tree: 66d16134973fca13b6f4c781ce10922895c8f343
parent: ff6d078115bb1f2965fa7962c39e11a22f8d0df3 [diff]
diff --git a/t/artificial-subtoken.t b/t/artificial-subtoken.t
new file mode 100644
index 0000000..ebf3b33
--- /dev/null
+++ b/t/artificial-subtoken.t

@@ -0,0 +1,65 @@
+#!/usr/bin/env perl
+# source ~/perl5/perlbrew/etc/bashrc
+# perlbrew switch perl-blead@korap
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use Benchmark ':hireswallclock';
+use lib 'lib', '../lib';
+use Scalar::Util qw/weaken/;
+
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+
+use_ok('KorAP::Document');
+
+my $path = catdir(dirname(__FILE__), 'artificial');
+ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
+is($doc->path, $path . '/', 'Path');
+ok($doc->parse, 'Parse document');
+
+sub new_tokenizer {
+  my $x = $doc;
+  weaken $x;
+  return KorAP::Tokenizer->new(
+    path => $x->path,
+    doc => $x,
+    foundry => 'OpenNLP',
+    layer => 'Tokens',
+    name => 'tokens'
+  )
+};
+
+is($doc->primary->data,
+   'Zum letzten kulturellen Anlass lädt die Leitung des Schulheimes Hofbergli ein, '.
+     'bevor der Betrieb Ende Schuljahr eingestellt wird.', 'Primary data');
+
+is($doc->primary->data_length, 129, 'Primary data length');
+
+is($doc->primary->data(0,3), 'Zum', 'Get primary data');
+
+# Get tokens
+use_ok('KorAP::Tokenizer');
+# Get tokenization
+ok(my $tokens = KorAP::Tokenizer->new(
+  path => $doc->path,
+  doc => $doc,
+  foundry => 'OpenNLP',
+  layer => 'Tokens',
+  name => 'tokens'
+), 'New Tokenizer');
+ok($tokens->parse, 'Parse');
+
+ok($tokens->add_subtokens, 'Add subtokens');
+
+# diag $tokens->to_string;
+
+#foreach (@{$tokens->stream->multi_term_tokens}) {
+#  print $_;
+#};
+
+done_testing;
+
+
+__END__

diff --git a/t/artificial.t b/t/artificial.t
index e4e5282..274e4ab 100644
--- a/t/artificial.t
+++ b/t/artificial.t

@@ -103,7 +103,7 @@
 # Add OpenNLP/sentences
 ok($tokens->add('OpenNLP', 'Sentences'), 'Add OpenNLP/Sentences');
 
-is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|opennlp/p:APPRART|<>:opennlp/s#0-129$<i>17|-:opennlp/sentences$<i>1]', 'Correct sentence');
+is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|opennlp/p:APPRART|<>:opennlp/s:s#0-129$<i>17|-:opennlp/sentences$<i>1]', 'Correct sentence');
 
 
 # New instantiation
@@ -124,7 +124,7 @@
 ok($tokens->add('Base', 'Paragraphs'), 'Add Base/Paragraphs');
 
 is($tokens->stream->pos(0)->to_string,
-   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:base/s#0-129$<i>17|<>:base/text#0-129$<i>17|-:base/sentences$<i>1|-:base/paragraphs$<i>0]',
+   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:base/s:s#0-129$<i>17|<>:base/s:t#0-129$<i>17|-:base/sentences$<i>1|-:base/paragraphs$<i>0]',
    'Correct base annotation');
 
 
@@ -161,7 +161,7 @@
 ok($tokens->add('CoreNLP', 'Sentences'), 'Add CoreNLP/Sentences');
 
 is($tokens->stream->pos(0)->to_string,
-   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|corenlp/p:APPRART|<>:corenlp/s#0-129$<i>17|-:corenlp/sentences$<i>1]',
+   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|corenlp/p:APPRART|<>:corenlp/s:s#0-129$<i>17|-:corenlp/sentences$<i>1]',
    'Correct corenlp annotation');
 
 
@@ -172,7 +172,7 @@
 ok($tokens->add('Connexor', 'Sentences'), 'Add Connexor/Sentences');
 
 is($tokens->stream->pos(0)->to_string,
-   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:cnx/s#0-129$<i>17|-:cnx/sentences$<i>1]',
+   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:cnx/s:s#0-129$<i>17|-:cnx/sentences$<i>1]',
    'Correct cnx annotation');
 
 # New instantiation
@@ -242,7 +242,7 @@
 # Add XIP/Sentences
 ok($tokens->add('XIP', 'Sentences'), 'Add XIP/Sentences');
 
-is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:xip/s#0-129$<i>17|-:xip/sentences$<i>1]', 'First sentence');
+is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:xip/s:s#0-129$<i>17|-:xip/sentences$<i>1]', 'First sentence');
 
 # Add XIP/Morpho
 ok($tokens->add('XIP', 'Morpho'), 'Add XIP/Morpho');

diff --git a/t/artificial/opennlp/tokens.xml b/t/artificial/opennlp/tokens.xml
index d0bc237..b181a49 100644
--- a/t/artificial/opennlp/tokens.xml
+++ b/t/artificial/opennlp/tokens.xml

@@ -1,6 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?><?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?><layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="ART_00001" VERSION="KorAP-0.4">
 <spanList>
-  -43
       <span id="s_7" from="0" to="3"/>
       <span id="s_8" from="4" to="11"/>
       <span id="s_9" from="12" to="23"/>

diff --git a/t/transform.t b/t/transform.t
index 3f351d7..093b5a4 100644
--- a/t/transform.t
+++ b/t/transform.t

@@ -105,13 +105,13 @@
 # Add sentences
 ok($tokens->add('Base', 'Sentences'), 'Add Sentences');
 
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s#0-74$<i>13|<>:base/text#0-6083$<i>923|-:base/sentences$<i>96]', 'Startinfo');
+is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96]', 'Startinfo');
 
 foreach (@layers) {
   ok($tokens->add(@$_), 'Add '. join(', ', @$_));
 };
 
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s#0-74$<i>13|<>:base/text#0-6083$<i>923|-:base/sentences$<i>96|<>:base/para#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s#0-74$<i>13|-:xip/sentences$<i>64]', 'Startinfo');
+is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96|<>:base/s:p#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s:s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s:s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s:s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s:s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s:s#0-74$<i>13|-:xip/sentences$<i>64]', 'Startinfo');
 
 
 #is($tokens->stream->pos(118)->to_string,
commit	f03c680ecc25127bdeea6ecd9bfac68cf02af912	[log] [tgz]
author	Nils Diewald <nils@diewald-online.de>	Mon Jul 21 16:39:44 2014 +0000
committer	Nils Diewald <nils@diewald-online.de>	Mon Jul 21 16:39:44 2014 +0000
tree	66d16134973fca13b6f4c781ce10922895c8f343
parent	ff6d078115bb1f2965fa7962c39e11a22f8d0df3 [diff]