Handle external tokenizer crashes more gracefully

When the external tokenizer process crashes, restart it and retry the
current text; if it keeps crashing, skip tokenization for that text
(its data.xml is still written, but no tokens.xml) and continue with
the remaining texts. Adds tests covering the retry-after-restart,
skip-after-repeated-crash, and subsequent-recovery behaviour.
Change-Id: I6b7300fd81e19ec608d892331efcdcea5611dfbc
diff --git a/t/script.t b/t/script.t
index b892236..62b54ad 100644
--- a/t/script.t
+++ b/t/script.t
@@ -198,6 +198,72 @@
->element_count_is('spanList span', 227);
};
+subtest 'Skip text after repeated external tokenizer crash' => sub {
+
+ my $cmd = catfile($f, 'cmd', 'tokenizer_faulty.pl');
+ my ($fh, $testfile) = korap_tempfile('script_exttok_skip');
+
+ print {$fh} <<'XML';
+<?xml version="1.0" encoding="UTF-8"?>
+<idsCorpus>
+ <idsHeader type="corpus">
+ <fileDesc>
+ <titleStmt>
+ <korpusSigle>CORP</korpusSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <idsDoc version="1.0">
+ <idsHeader type="document">
+ <fileDesc>
+ <titleStmt>
+ <dokumentSigle>CORP/DOC</dokumentSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <idsText version="1.0">
+ <idsHeader type="text">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>CORP/DOC.00001</textSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <text>
+ stable text
+ </text>
+ </idsText>
+ <idsText version="1.0">
+ <idsHeader type="text">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>CORP/DOC.00002</textSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <text>
+ __ALWAYS_CRASH__ text
+ </text>
+ </idsText>
+ </idsDoc>
+</idsCorpus>
+XML
+ close($fh);
+
+ test_tei2korapxml(
+ file => $testfile,
+ param => "-tc='perl $cmd'",
+ tmp => 'script_exttok_skip'
+ )
+ ->stderr_like(qr!tei2korapxml:.*? text_id=CORP_DOC\.00001!)
+ ->stderr_like(qr!tei2korapxml:.*? text_id=CORP_DOC\.00002!)
+ ->stderr_like(qr!External tokenizer failed for 'CORP_DOC\.00002' on attempt 1/2!)
+ ->stderr_like(qr!Skipping tokenization for 'CORP_DOC\.00002' after 2/2 attempts!)
+ ->file_readable('CORP/DOC/00001/base/tokens.xml')
+ ->file_exists_not('CORP/DOC/00002/base/tokens.xml')
+ ->file_readable('CORP/DOC/00002/data.xml');
+};
+
subtest 'Check KorAP tokenizer for infinite loop bug' => sub {
my $file = catfile($f, 'data', 'korap_tokenizer_challenge.xml');
diff --git a/t/tokenization-external.t b/t/tokenization-external.t
index 874f0fe..ac72e68 100644
--- a/t/tokenization-external.t
+++ b/t/tokenization-external.t
@@ -3,6 +3,7 @@
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
+use File::Temp qw/tempfile/;
use Test::XML::Loy;
use FindBin;
@@ -16,6 +17,7 @@
my $f = dirname(__FILE__);
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+my $faulty_cmd = catfile($f, 'cmd', 'tokenizer_faulty.pl');
# Test aggressive
my $ext = KorAP::XML::TEI::Tokenizer::External->new(
@@ -55,5 +57,24 @@
$t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
$t->element_count_is('layer spanList span', 2);
+my (undef, $state_file) = tempfile();
+
+$ext = KorAP::XML::TEI::Tokenizer::External->new(
+ "perl $faulty_cmd '$state_file'"
+);
+$ext->tokenize("Der __CRASH_ONCE__ Mann");
+$str = $ext->to_string('retry-doc');
+ok($str, 'Tokenization succeeds after restarting the external tokenizer');
+$t = Test::XML::Loy->new($str);
+$t->element_exists('layer spanList span:nth-child(1)', 'Retry produces token bounds');
+
+$ext->tokenize("Der __ALWAYS_CRASH__ Mann");
+ok(!defined $ext->to_string('skip-doc'), 'Tokenization can be skipped after repeated crashes');
+
+$ext->tokenize("Der alte Mann");
+$str = $ext->to_string('recovered-doc');
+ok($str, 'Tokenizer can continue after a skipped text');
+$t = Test::XML::Loy->new($str);
+$t->element_count_is('layer spanList span', 3);
done_testing;