Establish collection object for token annotations
Change-Id: I03f9ff1f28301135b24dc111b7ef85c3af86a8e6
diff --git a/t/script.t b/t/script.t
index 6af3095..6ceadd9 100644
--- a/t/script.t
+++ b/t/script.t
@@ -459,5 +459,41 @@
;
};
+subtest 'Check Inline annotations with untagged file' => sub { # inline mode on input for which no morpho.xml is expected
+
+ # Build the path of the sample I5 file inside the data directory below $f
+ my $file = catfile($f, 'data', 'goe_sample.i5.xml');
+
+ my ($fh, $outzip) = korap_tempfile('script_untagged'); # presumably a temp file handle and its path - confirm against korap_tempfile
+
+ # Generate zip file (unportable: needs a shell with cat, pipes, redirection and VAR=value prefixes)
+ stderr_like(
+ sub { `cat '$file' | KORAPXMLTEI_INLINE=1 perl '$script' > '$outzip'` }, # STDOUT of the script is written to $outzip
+ qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!, # STDERR has to mention the processed text id
+ 'Processing 1'
+ );
+
+ # TODO: there should be a better way to test this (the conversion is run a second time only to inspect STDERR again)
+ stderr_unlike(
+ sub { `cat '$file' | KORAPXMLTEI_INLINE=1 perl '$script' > '$outzip'` }, # overwrites $outzip from the first run
+ qr!.*undefined value.*!, # no "undefined value" message may show up on STDERR
+ 'Processing 2'
+ );
+ # Inspect the archive left behind by the second run
+
+ ok(-e $outzip, "File $outzip exists");
+
+ my $zip = IO::Uncompress::Unzip->new(
+ $outzip,
+ Name => 'GOE/AGA/00000/tokens/morpho.xml' # try to open this archive member directly
+ );
+ ok((not $zip), 'missing morpho.xml'); # a false constructor result means the member could not be opened
+
+ $zip = IO::Uncompress::Unzip->new(
+ $outzip,
+ Name => 'GOE/AGA/00000/struct/structure.xml'
+ );
+ ok($zip, 'found structure.xml'); # the structure layer has to be present nevertheless
+};
done_testing;
diff --git a/t/token.t b/t/token.t
new file mode 100644
index 0000000..85bab65
--- /dev/null
+++ b/t/token.t
@@ -0,0 +1,83 @@
+use strict;
+use warnings;
+use Test::More;
+use Test::XML::Loy;
+
+use FindBin;
+BEGIN {
+ unshift @INC, "$FindBin::Bin/../lib";
+};
+
+use_ok('KorAP::XML::TEI::Tokenizer::Token');
+
+subtest 'Initialization' => sub {
+ my $t = KorAP::XML::TEI::Tokenizer::Token->new;
+ # A token created without arguments has neither offsets nor a level
+ ok(!defined($t->from), 'Undefined from');
+ ok(!defined($t->to), 'Undefined to');
+ ok(!defined($t->level), 'Undefined level');
+
+ $t->add_attribute('foo' => 'bar');
+ $t->add_attribute('x' => 'y');
+ $t->set_from(7);
+ $t->set_to(5);
+ $t->set_from(4); # the later call has to override the from offset of 7
+
+ my $loy = Test::XML::Loy->new($t->to_string(3));
+ # The argument of to_string shows up in the span id; attributes are expected in insertion order
+ $loy->attr_is('span', 'id', 's3')
+ ->attr_is('span', 'from', 4)
+ ->attr_is('span', 'to', 5)
+ ->attr_is('span fs f', 'name', 'lex')
+ ->attr_is('span fs f fs f:nth-of-type(1)', 'name', 'foo')
+ ->text_is('span fs f fs f:nth-of-type(1)', 'bar')
+ ->attr_is('span fs f fs f:nth-of-type(2)', 'name', 'x')
+ ->text_is('span fs f fs f:nth-of-type(2)', 'y')
+ ;
+ # Accessors have to agree with the serialisation; descriptions make failures identifiable in the TAP output
+ is($t->from, 4, 'From is the value set last');
+ is($t->to, 5, 'To is set');
+ is($t->level, undef, 'Level is still undefined');
+ $t->set_level(19);
+ is($t->level, 19, 'Level is set');
+
+ $loy = Test::XML::Loy->new($t->to_string(3));
+ # Once a level is set, the span is expected to carry it in the "l" attribute
+ $loy->attr_is('span', 'id', 's3')
+ ->attr_is('span', 'from', 4)
+ ->attr_is('span', 'to', 5)
+ ->attr_is('span', 'l', 19)
+ ;
+};
+
+
+subtest 'Test inline annotations' => sub { # same token serialised in plain and in inline-annotation form
+ my $t = KorAP::XML::TEI::Tokenizer::Token->new('x1', 0, 6); # presumably (name, from, to) - only to=6 is asserted below
+ $t->add_attribute('ana' => 'DET @PREMOD'); # two space-separated values in one attribute
+ $t->add_attribute('lemma' => 'C & A'); # the ampersand is expected back unchanged after XML parsing
+
+ my $loy = Test::XML::Loy->new($t->to_string(1)); # plain form: attributes keep their own names
+
+ $loy->attr_is('span', 'id', 's1')
+ ->attr_is('span', 'to', 6)
+ ->attr_is('span > fs > f > fs f:nth-of-type(1)', 'name', 'ana')
+ ->text_is('span > fs > f > fs f:nth-of-type(1)', 'DET @PREMOD')
+ ->attr_is('span > fs > f > fs f:nth-of-type(2)', 'name', 'lemma')
+ ->text_is('span > fs > f > fs f:nth-of-type(2)', 'C & A')
+ ;
+
+ $loy = Test::XML::Loy->new($t->to_string_with_inline_annotations(1)); # inline form: "ana" is expected as separate "pos" and "msd" features
+
+ $loy->attr_is('span', 'id', 's1')
+ ->attr_is('span', 'to', 6)
+ ->attr_is('span > fs > f > fs f:nth-of-type(1)', 'name', 'pos')
+ ->text_is('span > fs > f > fs f:nth-of-type(1)', 'DET')
+ ->attr_is('span > fs > f > fs f:nth-of-type(2)', 'name', 'msd')
+ ->text_is('span > fs > f > fs f:nth-of-type(2)', '@PREMOD')
+ ->attr_is('span > fs > f > fs f:nth-of-type(3)', 'name', 'lemma')
+ ->text_is('span > fs > f > fs f:nth-of-type(3)', 'C & A')
+};
+
+
+done_testing;
+
diff --git a/t/tokenization-collect.t b/t/tokenization-collect.t
new file mode 100644
index 0000000..952c4b7
--- /dev/null
+++ b/t/tokenization-collect.t
@@ -0,0 +1,46 @@
+use strict;
+use warnings;
+use Test::More;
+use Test::XML::Loy;
+
+use FindBin;
+BEGIN {
+ unshift @INC, "$FindBin::Bin/../lib";
+};
+
+use_ok('KorAP::XML::TEI::Tokenizer::Collector');
+
+my $t = KorAP::XML::TEI::Tokenizer::Collector->new; # collection object for token annotations
+
+$t->add_token('x1',0,8);
+my $token = $t->add_token('x2',9,14,2); # add_token hands back the new token; the fourth argument is asserted as "l" below
+$t->add_token('x3',15,20);
+
+my $loy = Test::XML::Loy->new($token->to_string(2)); # the argument of to_string shows up in the span id (s2)
+
+$loy->attr_is('span', 'id', 's2')
+ ->attr_is('span', 'from', 9)
+ ->attr_is('span', 'to', 14)
+ ->attr_is('span', 'l', 2)
+ ->attr_is('span fs f', 'name', 'lex')
+ ;
+
+$loy = Test::XML::Loy->new($t->last_token->to_string(3)); # last_token is expected to be the token added last (x3)
+
+$loy->attr_is('span', 'id', 's3')
+ ->attr_is('span', 'from', 15)
+ ->attr_is('span', 'to', 20)
+ ->attr_is('span fs f', 'name', 'lex')
+;
+
+$loy = Test::XML::Loy->new($t->to_string('text', 0)) # whole collection: first argument is asserted as docid; NOTE(review): meaning of the second argument is not visible here
+ ->attr_is('layer', 'docid', 'text')
+ ->attr_is('span#s0', 'to', '8')
+ ->attr_is('span#s1', 'to', '14')
+ ->attr_is('span#s1', 'l', '2')
+ ->attr_is('span#s2', 'to', '20')
+;
+
+
+done_testing;
+
diff --git a/t/tokenization.t b/t/tokenization.t
index 92b7cc3..9d986a0 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -18,10 +18,15 @@
# Test aggressive
my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+ok($aggr->empty, 'Empty'); # nothing has been tokenized yet
$aggr->tokenize("Der alte Mann");
+ok(!$aggr->empty, 'Not empty'); # tokenize has stored offsets
is_deeply($aggr, [0,3,4,8,9,13]);
-$aggr->reset->tokenize("Der alte bzw. der grau-melierte Mann");
+$aggr->reset; # no longer chained into tokenize, so emptiness can be checked in between
+ok($aggr->empty, 'Empty'); # reset has to clear the stored offsets
+
+$aggr->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply($aggr, [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36]);
like(