# Tests for the KorAP tokenizer wrapper (KorAP::XML::TEI::Tokenizer::KorAP).
use strict;
use warnings;
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
use Test::XML::Loy;

use FindBin;
use utf8;  # string literals in this file contain non-ASCII characters

# Skip the whole test file when the KorAP tokenizer module cannot be loaded.
# The check runs in a BEGIN block, i.e. at compile time, before any of the
# run-time tests below are reached.
BEGIN {
  eval {
    require KorAP::XML::TEI::Tokenizer::KorAP;
    1;    # make the eval block return true on a successful require
  } or do {
    plan skip_all => "KorAP::XML::TEI::Tokenizer::KorAP cannot be used";
  };
}

# Both modules must load before any tokenizer behaviour is exercised.
use_ok('KorAP::XML::TEI::Annotations::Collector');
require_ok('KorAP::XML::TEI::Tokenizer::KorAP');

# A single tokenizer instance is shared by all checks in this file.
# NOTE(review): the constructor argument (1) presumably enables the sentence
# splitting used by sentencize_from_previous_input() - confirm in the module.
my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new(1);

# Plain ASCII input: three whitespace-separated words are expected to yield
# exactly three spans with the from/to offsets asserted below.
$ext->tokenize("Der alte Mann");
my $str = $ext->to_string('unknown');
my $t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 3)
  ->attr_is('layer spanList span:nth-child(2)', 'from', 4)
  ->attr_is('layer spanList span:nth-child(2)', 'to', 8)
  ->attr_is('layer spanList span:nth-child(3)', 'from', 9)
  ->attr_is('layer spanList span:nth-child(3)', 'to', 13)
  ->element_count_is('layer spanList span', 3);

# Non-ASCII input: the expected offsets count characters, not bytes
# ("über" ends at 9 and "Straße" spans 14..20, which only holds when the
# umlaut and the sharp s each count as one position).
# No reset is called before this tokenize, yet the expected offsets start
# from the beginning of the new input.
$ext->tokenize("ging über die Straße");
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 4)
  ->attr_is('layer spanList span:nth-child(2)', 'from', 5)
  ->attr_is('layer spanList span:nth-child(2)', 'to', 9)
  ->attr_is('layer spanList span:nth-child(3)', 'from', 10)
  ->attr_is('layer spanList span:nth-child(3)', 'to', 13)
  ->attr_is('layer spanList span:nth-child(4)', 'from', 14)
  ->attr_is('layer spanList span:nth-child(4)', 'to', 20)
  ->element_count_is('layer spanList span', 4);

# Input containing U+0004 followed by a newline: only the two tokens in front
# of it are expected; "das ist cool" must not produce any spans.
# NOTE(review): U+0004 (EOT) is presumably the tokenizer's end-of-text
# marker - confirm against the tokenizer implementation.
$ext->reset;
$ext->tokenize("Hu aha\x{04}\ndas ist cool");
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 2)
  ->attr_is('layer spanList span:nth-child(2)', 'from', 3)
  ->attr_is('layer spanList span:nth-child(2)', 'to', 6)
  ->element_count_is('layer spanList span', 2);

# Punctuation, quotation marks and an "@" handle: 14 tokens are expected, the
# last one being checked for its offsets.
# NOTE(review): counting characters in the literal as written here puts the
# last token ("@plutokiller") at 79..91, one position before the asserted
# 80..92 - confirm the literal has not lost a whitespace character
# (e.g. a doubled space) somewhere.
my $string = "Pluto.\" Eris-Entdecker Mike Brown, der im Kurznachrichtendienst Twitter unter \"\@plutokiller";
$ext->reset;
$ext->tokenize($string);
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(14)', 'from', 80)
  ->attr_is('layer spanList span:nth-child(14)', 'to', 92)
  ->element_count_is('layer spanList span', 14);

# Sentence splitting: the tokenizer fills a fresh annotation collector and the
# last collected structure is checked for its from/to/l attributes.
# NOTE(review): presumably the sentences are computed from the input of the
# preceding tokenize() call, and the argument to to_string() (3) is not
# explained by anything visible here - confirm both against the modules.
my $structures = KorAP::XML::TEI::Annotations::Collector->new;
$ext->sentencize_from_previous_input($structures);
my $last_structure = $structures->[-1];
$t = Test::XML::Loy->new($last_structure->to_string(3));
$t->attr_is('span', 'from', 6)
  ->attr_is('span', 'to', 92)
  ->attr_is('span', 'l', -1, "sentence splitting with korap tokenizer");

# A host name with dots must stay one token: "www.wikipedia.de" is expected
# as the third span, covering 13..29, with three spans in total.
$string = "Gefunden auf www.wikipedia.de";
$ext->reset;
$ext->tokenize($string);
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(3)', 'from', 13)
  ->attr_is('layer spanList span:nth-child(3)', 'to', 29)
  ->element_count_is('layer spanList span', 3);

done_testing;