blob: e867aed43017a800cbcb10a182e11a4ea357ec13 [file] [log] [blame]
Akron8b511f92020-07-09 17:28:08 +02001use strict;
2use warnings;
3use Test::More;
4use File::Basename 'dirname';
5use Data::Dumper;
6use File::Spec::Functions qw/catfile/;
7use File::Temp 'tempfile';
8use Test::XML::Loy;
9
10use FindBin;
# Prepend the lib/ directory that sits one level above this script's
# directory to the module search path, at compile time.
BEGIN {
  unshift(@INC, $FindBin::Bin . '/../lib');
}
14
# The module under test has to be loadable.
require_ok('KorAP::XML::TEI::Tokenizer::External');

# Path of the tokenizer script in the cmd/ directory next to this test file.
my $f = dirname(__FILE__);
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');

# Create the external tokenizer with the command line it should use.
# $^X is the interpreter that is running this test, so the script is executed
# by the very same perl rather than by whichever "perl" is first in PATH.
# NOTE(review): neither path is shell-quoted (same as before), so directories
# containing whitespace are presumably not supported - confirm if needed.
my $ext = KorAP::XML::TEI::Tokenizer::External->new(
  $^X . ' ' . $cmd
  # 'java -cp Ingestion/target/KorAP-Ingestion-pipeline.jar de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl'
);
25
# Tokenize a single line of text.
$ext->tokenize("Der alte Mann");
# TODO:
#   see comments on $sep in 'lib/KorAP/XML/TEI/Tokenizer/External.pm'
#$ext->tokenize("ging über die Straße");

# Turn the tokenizer output into a string and load it for XML assertions.
my $str = $ext->to_string('unknown');
my $t = Test::XML::Loy->new($str);

# Each entry is [ span position, attribute name, expected value ].
for my $check (
  [1, 'to',   3],
  [2, 'from', 4],
  [2, 'to',   8],
  [3, 'from', 9],
  [3, 'to',   13],
) {
  my ($pos, $attr, $value) = @$check;
  $t->attr_is("layer spanList span:nth-child($pos)", $attr, $value);
}

# Exactly three spans are expected.
$t->element_count_is('layer spanList span', 3);
39
# Reset the tokenizer, then tokenize a text that contains a line break.
$ext->reset;
$ext->tokenize("Hu aha\ndas ist cool");

# Reuse the variables declared earlier for the new output.
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);

# Each entry is [ span position, attribute name, expected value ].
for my $check (
  [1, 'to',   2],
  [2, 'from', 3],
  [2, 'to',   6],
) {
  my ($pos, $attr, $value) = @$check;
  $t->attr_is("layer spanList span:nth-child($pos)", $attr, $value);
}

# Only two spans are expected for this input.
$t->element_count_is('layer spanList span', 2);


done_testing;