Handle tokenizer crashes more gracefully
Change-Id: I6b7300fd81e19ec608d892331efcdcea5611dfbc
diff --git a/t/script.t b/t/script.t
index b892236..62b54ad 100644
--- a/t/script.t
+++ b/t/script.t
@@ -198,6 +198,72 @@
->element_count_is('spanList span', 227);
};
+subtest 'Skip text after repeated external tokenizer crash' => sub {
+ # Feeds two texts through an external tokenizer command; only the second text carries the crash marker.
+ my $cmd = catfile($f, 'cmd', 'tokenizer_faulty.pl'); # script used as tokenizer; presumably dies on __ALWAYS_CRASH__ - confirm in the script
+ my ($fh, $testfile) = korap_tempfile('script_exttok_skip'); # write handle and path of the input file (helper defined elsewhere)
+
+ print {$fh} <<'XML'; # single-quoted heredoc: the fixture is written verbatim, without interpolation
+<?xml version="1.0" encoding="UTF-8"?>
+<idsCorpus>
+ <idsHeader type="corpus">
+ <fileDesc>
+ <titleStmt>
+ <korpusSigle>CORP</korpusSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <idsDoc version="1.0">
+ <idsHeader type="document">
+ <fileDesc>
+ <titleStmt>
+ <dokumentSigle>CORP/DOC</dokumentSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <idsText version="1.0">
+ <idsHeader type="text">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>CORP/DOC.00001</textSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <text>
+ stable text
+ </text>
+ </idsText>
+ <idsText version="1.0">
+ <idsHeader type="text">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>CORP/DOC.00002</textSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <text>
+ __ALWAYS_CRASH__ text
+ </text>
+ </idsText>
+ </idsDoc>
+</idsCorpus>
+XML
+ close($fh); # close the handle before the path is handed to test_tei2korapxml
+ # Expectations: both texts are reported, the second one is retried and then skipped, the first one is unaffected.
+ test_tei2korapxml(
+ file => $testfile,
+ param => "-tc='perl $cmd'", # the faulty script is passed as the value of the -tc parameter
+ tmp => 'script_exttok_skip'
+ )
+ ->stderr_like(qr!tei2korapxml:.*? text_id=CORP_DOC\.00001!) # first text shows up in the log output
+ ->stderr_like(qr!tei2korapxml:.*? text_id=CORP_DOC\.00002!) # second text shows up in the log output as well
+ ->stderr_like(qr!External tokenizer failed for 'CORP_DOC\.00002' on attempt 1/2!) # first failure is reported with its attempt counter
+ ->stderr_like(qr!Skipping tokenization for 'CORP_DOC\.00002' after 2/2 attempts!) # tokenization is given up after the second attempt
+ ->file_readable('CORP/DOC/00001/base/tokens.xml') # the text without the marker still gets a token layer
+ ->file_exists_not('CORP/DOC/00002/base/tokens.xml') # no token layer is written for the skipped text
+ ->file_readable('CORP/DOC/00002/data.xml'); # the skipped text's data.xml is still written
+};
+
subtest 'Check KorAP tokenizer for infinite loop bug' => sub {
my $file = catfile($f, 'data', 'korap_tokenizer_challenge.xml');