blob: 742c656754e8809e993119d858fc892f22a2ca37 [file] [log] [blame]
Akron8b511f92020-07-09 17:28:08 +02001use strict;
2use warnings;
3use Test::More;
4use File::Basename 'dirname';
Akron8b511f92020-07-09 17:28:08 +02005use File::Spec::Functions qw/catfile/;
Akron8b511f92020-07-09 17:28:08 +02006use Test::XML::Loy;
7
8use FindBin;
BEGIN {
  # Put the distribution's own lib directory (relative to this test file)
  # at the front of the module search path before anything is loaded.
  my $local_lib = "$FindBin::Bin/../lib";
  unshift(@INC, $local_lib);
}
12
# Check that the external tokenizer wrapper can be loaded at all.
require_ok('KorAP::XML::TEI::Tokenizer::External');

# The tokenizer script used as the external command lives in the 'cmd'
# directory next to this test file.
my $f = dirname(__FILE__);
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');

# Test external tokenizer.
# The script is started with the interpreter that runs this test ($^X)
# instead of whichever 'perl' comes first in PATH, so the child process
# uses the same perl installation as the test itself.
my $ext = KorAP::XML::TEI::Tokenizer::External->new(
  $^X . ' ' . $cmd
  # 'java -cp Ingestion/target/KorAP-Ingestion-pipeline.jar de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl'
);

$ext->tokenize("Der alte Mann");
# TODO:
# see comments on $sep in 'lib/KorAP/XML/TEI/Tokenizer/External.pm'
#$ext->tokenize("ging über die Straße");

# Expect three spans whose character offsets correspond to
# "Der" (..3), "alte" (4..8) and "Mann" (9..13).
# The 'from' attribute of the first span is not asserted here.
my $str = $ext->to_string('unknown');
my $t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 3);
$t->attr_is('layer spanList span:nth-child(2)', 'from', 4);
$t->attr_is('layer spanList span:nth-child(2)', 'to', 8);
$t->attr_is('layer spanList span:nth-child(3)', 'from', 9);
$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
$t->element_count_is('layer spanList span', 3);

# Second scenario: reset the tokenizer object, then feed input that
# contains an embedded newline.
$ext->reset;
$ext->tokenize("Hu aha\ndas ist cool");

# Only the two tokens before the newline ("Hu" ..2, "aha" 3..6) are
# expected in the output.
# NOTE(review): presumably a consequence of the $sep handling mentioned in
# the TODO above - confirm against External.pm.
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 2);
$t->attr_is('layer spanList span:nth-child(2)', 'from', 3);
$t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
$t->element_count_is('layer spanList span', 2);


done_testing;