blob: ac72e68c3715046a96a327f5f51a415d451757cb [file] [log] [blame]
Akron8b511f92020-07-09 17:28:08 +02001use strict;
2use warnings;
3use Test::More;
4use File::Basename 'dirname';
Akron8b511f92020-07-09 17:28:08 +02005use File::Spec::Functions qw/catfile/;
Marc Kupietz8ab68322026-03-18 18:04:14 +01006use File::Temp qw/tempfile/;
Akron8b511f92020-07-09 17:28:08 +02007use Test::XML::Loy;
8
9use FindBin;
Marc Kupietz52dc21b2020-09-05 13:51:22 +020010use utf8;
11
# Put the working tree's lib/ directory (one level above this test's
# directory) at the front of @INC at compile time, so that modules from the
# checkout are found before any other copy on the search path.
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
}

# Check that the module under test can be loaded.
require_ok('KorAP::XML::TEI::Tokenizer::External');

# Locate the helper scripts in the cmd/ directory next to this test file.
# They are started below as the external tokenizer command.
my $f = dirname(__FILE__);
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
my $faulty_cmd = catfile($f, 'cmd', 'tokenizer_faulty.pl');

# Test the external tokenizer with the helper script as its command.
# A commented-out alternative command line is kept here for reference:
#   'java -cp Ingestion/target/KorAP-Ingestion-pipeline.jar de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl'
my $ext = KorAP::XML::TEI::Tokenizer::External->new(
  'perl ' . $cmd
);

# Three words separated by single spaces: expect three spans whose
# from/to attributes match the word boundaries in the input string.
$ext->tokenize("Der alte Mann");
my $str = $ext->to_string('unknown');
my $t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 3);
$t->attr_is('layer spanList span:nth-child(2)', 'from', 4);
$t->attr_is('layer spanList span:nth-child(2)', 'to', 8);
$t->attr_is('layer spanList span:nth-child(3)', 'from', 9);
$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
$t->element_count_is('layer spanList span', 3);

# Second text on the same tokenizer object, without an intervening reset.
# The input contains non-ASCII letters; the expected offsets count
# characters rather than bytes (e.g. "Straße" spans 14..20), and they start
# again from the beginning of this text.
$ext->tokenize("ging über die Straße");
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 4);
$t->attr_is('layer spanList span:nth-child(2)', 'from', 5);
$t->attr_is('layer spanList span:nth-child(2)', 'to', 9);
$t->attr_is('layer spanList span:nth-child(3)', 'from', 10);
$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
$t->attr_is('layer spanList span:nth-child(4)', 'from', 14);
$t->attr_is('layer spanList span:nth-child(4)', 'to', 20);
$t->element_count_is('layer spanList span', 4);

# Reset the tokenizer, then feed a text that contains \x{04} followed by a
# newline in the middle. Only "Hu" and "aha" are expected as spans; nothing
# after the control character should show up in the result.
# NOTE(review): presumably \x{04} + newline is the end-of-text marker of the
# external tokenizer protocol -- confirm against the External module.
$ext->reset;
$ext->tokenize("Hu aha\x{04}\ndas ist cool");

$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 2);
$t->attr_is('layer spanList span:nth-child(2)', 'from', 3);
$t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
$t->element_count_is('layer spanList span', 2);

# Crash-recovery tests against the deliberately faulty helper script.
# The script receives the path of a scratch file as its only argument.
# NOTE(review): presumably it uses that file to remember across restarts
# that it already crashed once -- confirm in cmd/tokenizer_faulty.pl.
#
# UNLINK => 1 makes File::Temp remove the scratch file when the test exits.
# A plain tempfile() called in list context keeps the file, which would
# leave one stray file in the temp directory per test run.
my (undef, $state_file) = tempfile(UNLINK => 1);

$ext = KorAP::XML::TEI::Tokenizer::External->new(
  "perl $faulty_cmd '$state_file'"
);

# A text that triggers a single crash must still produce a result.
$ext->tokenize("Der __CRASH_ONCE__ Mann");
$str = $ext->to_string('retry-doc');
ok($str, 'Tokenization succeeds after restarting the external tokenizer');
$t = Test::XML::Loy->new($str);
$t->element_exists('layer spanList span:nth-child(1)', 'Retry produces token bounds');

# A text that crashes the tokenizer every time yields undef instead of output.
$ext->tokenize("Der __ALWAYS_CRASH__ Mann");
ok(!defined $ext->to_string('skip-doc'), 'Tokenization can be skipped after repeated crashes');

# After a skipped text the same object must handle ordinary input again.
$ext->tokenize("Der alte Mann");
$str = $ext->to_string('recovered-doc');
ok($str, 'Tokenizer can continue after a skipped text');
$t = Test::XML::Loy->new($str);
$t->element_count_is('layer spanList span', 3);

done_testing;