blob: a4c547edeb2c4d04f5326db7a2449a4d67b50ed5 [file] [log] [blame]
Marc Kupietz1e882fb2020-09-09 00:05:46 +02001use strict;
2use warnings;
3use Test::More;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catfile/;
6use Test::XML::Loy;
7
8use FindBin;
9use utf8;
10
# Skip the entire test file at compile time when the tokenizer module
# cannot be loaded.
BEGIN {
  # The trailing 1 makes the eval true only if require did not die.
  my $loaded = eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 };

  plan skip_all => "KorAP::XML::TEI::Tokenizer::KorAP cannot be used"
    unless $loaded;
}
19
# Counted as the first test. The BEGIN block above has already skipped the
# file if the module could not be loaded, so this is expected to pass.
require_ok('KorAP::XML::TEI::Tokenizer::KorAP');

# A single tokenizer instance is shared by all test cases below.
# (The previously computed path to cmd/tokenizer.pl was never used by any
# test in this file and has been dropped.)
my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new();
26
# Plain ASCII input: three words are expected to yield exactly three spans.
$ext->tokenize("Der alte Mann");
my $str = $ext->to_string('unknown');
my $t = Test::XML::Loy->new($str);

# Each row is (selector, attribute, expected value). The 'from' of the
# first span is not asserted.
for my $check (
  ['layer spanList span:nth-child(1)', 'to',   3],
  ['layer spanList span:nth-child(2)', 'from', 4],
  ['layer spanList span:nth-child(2)', 'to',   8],
  ['layer spanList span:nth-child(3)', 'from', 9],
  ['layer spanList span:nth-child(3)', 'to',   13],
) {
  $t->attr_is(@$check);
}
$t->element_count_is('layer spanList span', 3);
36
# Input with non-ASCII letters. The expected offsets count characters, not
# UTF-8 bytes (e.g. "über" spans 5-9), and four spans are expected.
# No reset is issued before this input; the expected offsets nevertheless
# start over from the beginning of the new string.
# NOTE(review): presumably to_string() leaves the tokenizer ready for fresh
# input - confirm against the tokenizer implementation.
$ext->tokenize("ging über die Straße");
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);

# Each row is (selector, attribute, expected value).
for my $check (
  ['layer spanList span:nth-child(1)', 'to',   4],
  ['layer spanList span:nth-child(2)', 'from', 5],
  ['layer spanList span:nth-child(2)', 'to',   9],
  ['layer spanList span:nth-child(3)', 'from', 10],
  ['layer spanList span:nth-child(3)', 'to',   13],
  ['layer spanList span:nth-child(4)', 'from', 14],
  ['layer spanList span:nth-child(4)', 'to',   20],
) {
  $t->attr_is(@$check);
}
$t->element_count_is('layer spanList span', 4);
48
# Input containing a \x{04} control character followed by more text.
# Only the two tokens in front of it are expected in the output.
# NOTE(review): presumably \x{04} acts as an end-of-text marker for the
# tokenizer - confirm against the tokenizer implementation.
$ext->reset;
$ext->tokenize("Hu aha\x{04}\ndas ist cool");
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);

# Each row is (selector, attribute, expected value).
for my $check (
  ['layer spanList span:nth-child(1)', 'to',   2],
  ['layer spanList span:nth-child(2)', 'from', 3],
  ['layer spanList span:nth-child(2)', 'to',   6],
) {
  $t->attr_is(@$check);
}
$t->element_count_is('layer spanList span', 2);
57
# Input with punctuation, double quotes and an @-handle. Fourteen spans are
# expected, the last one covering the handle.
# NOTE(review): counting the characters of the literal as written here puts
# the last token at 79-91 rather than 80-92 - verify the literal (internal
# whitespace may have been altered) or the tokenizer's offset behaviour.
$ext->reset;
my $string = "Pluto.\" Eris-Entdecker Mike Brown, der im Kurznachrichtendienst Twitter unter \"\@plutokiller";
$ext->tokenize($string);
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);

# Each row is (selector, attribute, expected value).
for my $check (
  ['layer spanList span:nth-child(14)', 'from', 80],
  ['layer spanList span:nth-child(14)', 'to',   92],
) {
  $t->attr_is(@$check);
}
$t->element_count_is('layer spanList span', 14);
Marc Kupietz7c88a7b2020-10-14 16:28:57 +020066
# Input ending in a host name: the whole "www.wikipedia.de" is expected to
# be a single span (13-29), giving three spans in total.
$ext->reset;
$string = "Gefunden auf www.wikipedia.de";
$ext->tokenize($string);
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);

# Each row is (selector, attribute, expected value).
for my $check (
  ['layer spanList span:nth-child(3)', 'from', 13],
  ['layer spanList span:nth-child(3)', 'to',   29],
) {
  $t->attr_is(@$check);
}
$t->element_count_is('layer spanList span', 3);

# No fixed plan is declared; the test count is finalised here.
done_testing;