# Source blob: 874f0fea55b91c180a5cd58bf9c1c3f3c4e82c3c
#
# Tests for KorAP::XML::TEI::Tokenizer::External.
#
# The tokenizer is driven through an external command (here the Perl helper
# script cmd/tokenizer.pl located next to this test file). Each scenario
# feeds a text to the tokenizer, serializes the result with to_string() and
# checks the 'from'/'to' attributes of the generated <span> elements.
use strict;
use warnings;
use utf8; # The test inputs below contain non-ASCII literals (ü, ß)
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
use Test::XML::Loy;

use FindBin;

# Make the library under test loadable relative to this test file
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
};

require_ok('KorAP::XML::TEI::Tokenizer::External');

# Path to the helper script used as the external tokenizer command
my $f = dirname(__FILE__);
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');

# Test the external tokenizer, using the bundled Perl script as the command
my $ext = KorAP::XML::TEI::Tokenizer::External->new(
  'perl ' . $cmd
  # 'java -cp Ingestion/target/KorAP-Ingestion-pipeline.jar de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl'
);

# Scenario 1: plain ASCII input, three tokens expected
$ext->tokenize("Der alte Mann");
my $str = $ext->to_string('unknown');
my $t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 3);
$t->attr_is('layer spanList span:nth-child(2)', 'from', 4);
$t->attr_is('layer spanList span:nth-child(2)', 'to', 8);
$t->attr_is('layer spanList span:nth-child(3)', 'from', 9);
$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
$t->element_count_is('layer spanList span', 3);

# Scenario 2: input with non-ASCII characters, four tokens expected.
# The expected offsets count characters, not bytes ("über" spans 5-9).
# No reset() is called before this input; the expected offsets
# nevertheless start over from the beginning of the new text.
$ext->tokenize("ging über die Straße");
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 4);
$t->attr_is('layer spanList span:nth-child(2)', 'from', 5);
$t->attr_is('layer spanList span:nth-child(2)', 'to', 9);
$t->attr_is('layer spanList span:nth-child(3)', 'from', 10);
$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
$t->attr_is('layer spanList span:nth-child(4)', 'from', 14);
$t->attr_is('layer spanList span:nth-child(4)', 'to', 20);
$t->element_count_is('layer spanList span', 4);

# Scenario 3: after an explicit reset(), the input contains \x{04} followed
# by a newline in the middle of the text. Only the two tokens preceding that
# sequence are expected in the output.
# NOTE(review): presumably \x{04} + newline acts as the end-of-text marker
# for the external tokenizer - confirm against the tokenizer implementation.
$ext->reset;
$ext->tokenize("Hu aha\x{04}\ndas ist cool");

$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 2);
$t->attr_is('layer spanList span:nth-child(2)', 'from', 3);
$t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
$t->element_count_is('layer spanList span', 2);


done_testing;