Fix possible IO deadlocks with KorAP tokenizer
Text separators should always have a newline in front of artificial EOTs
to make sure they are recognized and to avoid them being consumed
by regular expressions for tokens.
Change-Id: I528c903904da50312a7472c7a34775476b0955be
diff --git a/t/data/korap_tokenizer_challenge.xml b/t/data/korap_tokenizer_challenge.xml
new file mode 100644
index 0000000..a37c3dc
--- /dev/null
+++ b/t/data/korap_tokenizer_challenge.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE idsCorpus>
+<idsCorpus version="1.0" TEIform="teiCorpus.2">
+ <idsDoc id="A00" type="text" version="1.0" TEIform="TEI.2">
+ <idsText id="WDD19.H0039.87242" n="de.Diskussion:Hilfswerk_der_Evangelischen_Kirchen_der_Schweiz" version="1">
+ <idsHeader type="text" pattern="text" version="1.0">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>WDD19/H0039.87242</textSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <text>
+ <body>
+ <div>
+ <p>?____</p>
+ </div>
+ </body>
+ </text>
+ </idsText>
+ </idsDoc>
+</idsCorpus>
diff --git a/t/script.t b/t/script.t
index fbe28bf..773bb60 100644
--- a/t/script.t
+++ b/t/script.t
@@ -175,6 +175,26 @@
->element_count_is('spanList span', 227);
};
+subtest 'Check KorAP tokenizer for infinite loop bug' => sub {
+ # Regression check: feed the minimal fixture korap_tokenizer_challenge.xml through the -tk run.
+ my $file = catfile($f, 'data', 'korap_tokenizer_challenge.xml');
+ # Skip this subtest when the KorAP tokenizer module cannot be loaded.
+ eval {
+ require KorAP::XML::TEI::Tokenizer::KorAP;
+ 1;
+ } or do {
+ plan skip_all => "KorAP::XML::TEI::Tokenizer::KorAP cannot be used";
+ };
+ # Expect the text id to be reported on stderr and structure.xml to be readable afterwards.
+ test_tei2korapxml(
+ file => $file,
+ param => "-tk -s",
+ tmp => 'script_bug_check'
+ )
+ ->stderr_like(qr!tei2korapxml: .*? text_id=WDD19_H0039\.87242!)
+ ->file_readable('WDD19/H0039/87242/struct/structure.xml');
+};
+
subtest 'Sentence split with KorAP tokenizer' => sub {
eval {