Handle tokenizer crashes more gracefully
Change-Id: I6b7300fd81e19ec608d892331efcdcea5611dfbc
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index e0f4f29..17a9bed 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -13,7 +13,9 @@
# to STDIN and reads boundary data from STDOUT.
use constant {
- WAIT_SECS => 3600
+ WAIT_SECS => 3600,
+ RETRY_ATTEMPTS => 1,
+ LOG_SNIPPET_CHARS => 120
};
@@ -41,8 +43,9 @@
select => undef,
sep => $sep,
sentence_split => undef,
+ last_input => undef,
sentence_starts => [],
- sentence_ends => [],
+ sentence_endss => [],
}, $class;
# Initialize tokenizer
@@ -55,6 +58,9 @@
sub tokenize {
my ($self, $txt) = @_;
return unless $self->{pid};
+ $self->{last_input} = $txt;
+ $self->{sentence_starts} = [];
+ $self->{sentence_endss} = [];
my $out = $self->{chld_in};
print $out encode('UTF-8', $txt) . $self->{sep};
return $self;
@@ -64,6 +70,7 @@
# Initialize the tokenizer and bind the communication
sub _init {
my $self = shift;
+ $self->{select} = undef;
# Open process
if ($self->{pid} = open2(
@@ -100,74 +107,153 @@
return;
};
- return '' unless $self->{select};
+ for (my $attempt = 1; $attempt <= RETRY_ATTEMPTS + 1; $attempt++) {
+ my $output = eval { $self->_to_string_once($text_id) };
+ return $output unless $@;
- # Start header
- my $output = $self->_header($text_id);
+ my $err = $@;
+ chomp $err;
- # Wait 60m for the external tokenizer
- if ($self->{select}->can_read(WAIT_SECS)) {
+ if ($attempt <= RETRY_ATTEMPTS && defined $self->{last_input}) {
+ $log->warn(
+ "External tokenizer failed for '$text_id' on attempt $attempt/" . (RETRY_ATTEMPTS + 1) .
+ ' (' . $self->_input_context . "): $err. Restarting tokenizer and retrying"
+ );
- my $out = $self->{chld_out};
- $_ = <$out>;
- my @bounds = split;
-
- if ($self->{sentence_split}) {
- # sentence boundaries will be on a second line
- $_ = <$out>;
- my @sentence_bounds = split;
-
- # Save all sentence bounds
- for (my $i = 0; $i < @sentence_bounds; $i += 2 ) {
- push @{$self->{sentence_starts}}, $sentence_bounds[$i];
- push @{$self->{sentence_endss}}, $sentence_bounds[$i+1];
- };
- }
-
- # Serialize all bounds
- my $c = 0;
- for (my $i = 0; $i < @bounds; $i += 2 ){
- unless (looks_like_number($bounds[$i]) && looks_like_number($bounds[$i+1])) {
- die $log->fatal("Token bounds not numerical from external tokenizer ('$text_id')");
- };
- $output .= qq! <span id="t_$c" from="! . $bounds[$i] . '" to="' .
- $bounds[$i+1] . qq!" />\n!;
- $c++;
+ $self->reset;
+ last unless $self->{pid};
+ $self->tokenize($self->{last_input});
+ next;
};
- while ($self->{select}->can_read(0)) {
- $_ = <$out>;
+ $log->error(
+ "Skipping tokenization for '$text_id' after $attempt/" . (RETRY_ATTEMPTS + 1) .
+ ' attempts (' . $self->_input_context . "): $err"
+ );
+ $self->reset;
+ return;
+ };
- if (defined $_ && $_ ne '') {
+ return;
+};
- # This warning is sometimes thrown, though not yet replicated
- # in the test suite. See the discussion in gerrit (3123:
- # Establish tokenizer object for external base tokenization)
- # for further issues.
- $log->warn("Extra output: $_");
- }
- else {
- $log->warn('Tokenizer seems to have crashed, restarting');
- $self->reset;
- };
+
+sub _to_string_once {
+ my ($self, $text_id) = @_;
+
+ die 'Tokenizer is not available' unless $self->{select};
+
+ my $output = $self->_header($text_id);
+ my ($bounds, $sentence_bounds) = $self->_read_bounds;
+
+ if ($self->{sentence_split}) {
+ for (my $i = 0; $i < @{$sentence_bounds}; $i += 2) {
+ push @{$self->{sentence_starts}}, $sentence_bounds->[$i];
+ push @{$self->{sentence_endss}}, $sentence_bounds->[$i + 1];
};
}
- else {
- die $log->fatal("Can\'t retrieve token bounds from external tokenizer ('$text_id')");
+ my $c = 0;
+ for (my $i = 0; $i < @{$bounds}; $i += 2) {
+ unless (
+ defined $bounds->[$i + 1] &&
+ looks_like_number($bounds->[$i]) &&
+ looks_like_number($bounds->[$i + 1])
+ ) {
+ die 'Token bounds not numerical';
+ };
+ $output .= qq! <span id="t_$c" from="! . $bounds->[$i] . '" to="' .
+ $bounds->[$i + 1] . qq!" />\n!;
+ $c++;
};
- # Add footer
+ $self->_drain_output;
return $output . $self->_footer;
};
+sub _read_bounds {
+ my $self = shift;
+
+ unless ($self->{select}->can_read(WAIT_SECS)) {
+ die "Timed out after " . WAIT_SECS . "s while waiting for token bounds";
+ };
+
+ my $out = $self->{chld_out};
+ my $bounds_line = <$out>;
+ unless (defined $bounds_line && $bounds_line ne '') {
+ die $self->_read_error('token bounds');
+ };
+
+ my @sentence_bounds;
+ if ($self->{sentence_split}) {
+ my $sentence_bounds_line = <$out>;
+ unless (defined $sentence_bounds_line && $sentence_bounds_line ne '') {
+ die $self->_read_error('sentence bounds');
+ };
+ @sentence_bounds = split ' ', $sentence_bounds_line;
+ };
+
+ return ([split(' ', $bounds_line)], \@sentence_bounds);
+};
+
+
+sub _drain_output {
+ my $self = shift;
+ my $out = $self->{chld_out};
+
+ while ($self->{select}->can_read(0)) {
+ my $line = <$out>;
+
+ if (defined $line && $line ne '') {
+
+ # This warning is sometimes thrown, though not yet replicated
+ # in the test suite. See the discussion in gerrit (3123:
+ # Establish tokenizer object for external base tokenization)
+ # for further issues.
+ $log->warn("Extra output from external tokenizer: $line");
+ }
+ else {
+ $log->warn('Tokenizer ended after responding, restarting for the next text');
+ $self->reset;
+ last;
+ };
+ };
+};
+
+
+sub _read_error {
+ my ($self, $what) = @_;
+ return "Reached EOF while reading $what from external tokenizer (pid=" .
+ ($self->{pid} // 'n/a') . ')';
+};
+
+
+sub _input_context {
+ my $self = shift;
+ my $text = $self->{last_input} // '';
+ return 'chars=' . length($text) . ', snippet="' . _snippet($text) . '"';
+};
+
+
+sub _snippet {
+ my $text = shift // '';
+ $text =~ s/\s+/ /g;
+ $text =~ s/"/\\"/g;
+ if (length($text) > LOG_SNIPPET_CHARS) {
+ return substr($text, 0, LOG_SNIPPET_CHARS - 3) . '...';
+ };
+ return $text;
+};
+
+
# Close communication channel
sub close {
my $self = shift;
- close($self->{chld_in});
- close($self->{chld_out});
+ close($self->{chld_in}) if defined $self->{chld_in};
+ close($self->{chld_out}) if defined $self->{chld_out};
$self->{chld_out} = $self->{chld_in} = undef;
+ $self->{select} = undef;
# Close the pid if still open
if ($self->{pid}) {
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 3d35077..f8a26c2 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -10,7 +10,7 @@
use File::Basename qw(dirname);
-use Encode qw(decode);
+use Encode qw(decode encode);
use FindBin;
BEGIN {
@@ -356,14 +356,22 @@
# Tokenize with external tokenizer
if ($ext_tok) {
- # Tokenize and output
- $ext_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
- $text_id_esc
- );
+ my $tokens_output = eval {
+ $ext_tok->tokenize($data->data)->to_string($text_id_esc);
+ };
- if ($use_tokenizer_sentence_splits) {
- $ext_tok->sentencize_from_previous_input($inline->structures);
+ if (my $err = $@) {
+ $err =~ s/\s+$//;
+ $log->error("Skipping external tokenization for '$text_id_esc': $err");
+ $ext_tok->reset;
+ }
+ elsif (defined $tokens_output) {
+ $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml")
+ ->print(encode('UTF-8', $tokens_output));
+
+ if ($use_tokenizer_sentence_splits) {
+ $ext_tok->sentencize_from_previous_input($inline->structures);
+ };
};
};
diff --git a/t/script.t b/t/script.t
index b892236..62b54ad 100644
--- a/t/script.t
+++ b/t/script.t
@@ -198,6 +198,72 @@
->element_count_is('spanList span', 227);
};
+subtest 'Skip text after repeated external tokenizer crash' => sub {
+
+ my $cmd = catfile($f, 'cmd', 'tokenizer_faulty.pl');
+ my ($fh, $testfile) = korap_tempfile('script_exttok_skip');
+
+ print {$fh} <<'XML';
+<?xml version="1.0" encoding="UTF-8"?>
+<idsCorpus>
+ <idsHeader type="corpus">
+ <fileDesc>
+ <titleStmt>
+ <korpusSigle>CORP</korpusSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <idsDoc version="1.0">
+ <idsHeader type="document">
+ <fileDesc>
+ <titleStmt>
+ <dokumentSigle>CORP/DOC</dokumentSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <idsText version="1.0">
+ <idsHeader type="text">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>CORP/DOC.00001</textSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <text>
+ stable text
+ </text>
+ </idsText>
+ <idsText version="1.0">
+ <idsHeader type="text">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>CORP/DOC.00002</textSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <text>
+ __ALWAYS_CRASH__ text
+ </text>
+ </idsText>
+ </idsDoc>
+</idsCorpus>
+XML
+ close($fh);
+
+ test_tei2korapxml(
+ file => $testfile,
+ param => "-tc='perl $cmd'",
+ tmp => 'script_exttok_skip'
+ )
+ ->stderr_like(qr!tei2korapxml:.*? text_id=CORP_DOC\.00001!)
+ ->stderr_like(qr!tei2korapxml:.*? text_id=CORP_DOC\.00002!)
+ ->stderr_like(qr!External tokenizer failed for 'CORP_DOC\.00002' on attempt 1/2!)
+ ->stderr_like(qr!Skipping tokenization for 'CORP_DOC\.00002' after 2/2 attempts!)
+ ->file_readable('CORP/DOC/00001/base/tokens.xml')
+ ->file_exists_not('CORP/DOC/00002/base/tokens.xml')
+ ->file_readable('CORP/DOC/00002/data.xml');
+};
+
subtest 'Check KorAP tokenizer for infinite loop bug' => sub {
my $file = catfile($f, 'data', 'korap_tokenizer_challenge.xml');
diff --git a/t/tokenization-external.t b/t/tokenization-external.t
index 874f0fe..ac72e68 100644
--- a/t/tokenization-external.t
+++ b/t/tokenization-external.t
@@ -3,6 +3,7 @@
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
+use File::Temp qw/tempfile/;
use Test::XML::Loy;
use FindBin;
@@ -16,6 +17,7 @@
my $f = dirname(__FILE__);
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+my $faulty_cmd = catfile($f, 'cmd', 'tokenizer_faulty.pl');
# Test aggressive
my $ext = KorAP::XML::TEI::Tokenizer::External->new(
@@ -55,5 +57,24 @@
$t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
$t->element_count_is('layer spanList span', 2);
+my (undef, $state_file) = tempfile();
+
+$ext = KorAP::XML::TEI::Tokenizer::External->new(
+ "perl $faulty_cmd '$state_file'"
+);
+$ext->tokenize("Der __CRASH_ONCE__ Mann");
+$str = $ext->to_string('retry-doc');
+ok($str, 'Tokenization succeeds after restarting the external tokenizer');
+$t = Test::XML::Loy->new($str);
+$t->element_exists('layer spanList span:nth-child(1)', 'Retry produces token bounds');
+
+$ext->tokenize("Der __ALWAYS_CRASH__ Mann");
+ok(!defined $ext->to_string('skip-doc'), 'Tokenization can be skipped after repeated crashes');
+
+$ext->tokenize("Der alte Mann");
+$str = $ext->to_string('recovered-doc');
+ok($str, 'Tokenizer can continue after a skipped text');
+$t = Test::XML::Loy->new($str);
+$t->element_count_is('layer spanList span', 3);
done_testing;