Handle tokenizer crashes more gracefully

Change-Id: I6b7300fd81e19ec608d892331efcdcea5611dfbc
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index e0f4f29..17a9bed 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -13,7 +13,9 @@
 # to STDIN and reads boundary data from STDOUT.
 
 use constant {
-  WAIT_SECS => 3600
+  WAIT_SECS         => 3600,
+  RETRY_ATTEMPTS    => 1,
+  LOG_SNIPPET_CHARS => 120
 };
 
 
@@ -41,8 +43,9 @@
       select          => undef,
       sep             => $sep,
       sentence_split  => undef,
+      last_input      => undef,
       sentence_starts => [],
-      sentence_ends   => [],
+      sentence_endss  => [],
   }, $class;
 
   # Initialize tokenizer
@@ -55,6 +58,9 @@
 sub tokenize {
   my ($self, $txt) = @_;
   return unless $self->{pid};
+  $self->{last_input} = $txt;
+  $self->{sentence_starts} = [];
+  $self->{sentence_endss} = [];
   my $out = $self->{chld_in};
   print $out encode('UTF-8', $txt) . $self->{sep};
   return $self;
@@ -64,6 +70,7 @@
 # Initialize the tokenizer and bind the communication
 sub _init {
   my $self = shift;
+  $self->{select} = undef;
 
   # Open process
   if ($self->{pid} = open2(
@@ -100,74 +107,153 @@
     return;
   };
 
-  return '' unless $self->{select};
+  for (my $attempt = 1; $attempt <= RETRY_ATTEMPTS + 1; $attempt++) {
+    my $output = eval { $self->_to_string_once($text_id) };
+    return $output unless $@;
 
-  # Start header
-  my $output = $self->_header($text_id);
+    my $err = $@;
+    chomp $err;
 
-  # Wait 60m for the external tokenizer
-  if ($self->{select}->can_read(WAIT_SECS)) {
+    if ($attempt <= RETRY_ATTEMPTS && defined $self->{last_input}) {
+      $log->warn(
+        "External tokenizer failed for '$text_id' on attempt $attempt/" . (RETRY_ATTEMPTS + 1) .
+        ' (' . $self->_input_context . "): $err. Restarting tokenizer and retrying"
+      );
 
-    my $out = $self->{chld_out};
-    $_ = <$out>;
-    my @bounds = split;
-
-    if ($self->{sentence_split}) {
-      # sentence boundaries will be on a second line
-      $_ = <$out>;
-      my @sentence_bounds = split;
-
-      # Save all sentence bounds
-      for (my $i = 0; $i < @sentence_bounds; $i +=  2 ) {
-        push @{$self->{sentence_starts}}, $sentence_bounds[$i];
-        push @{$self->{sentence_endss}}, $sentence_bounds[$i+1];
-      };
-    }
-
-    # Serialize all bounds
-    my $c = 0;
-    for (my $i = 0; $i < @bounds; $i +=  2 ){
-      unless (looks_like_number($bounds[$i]) && looks_like_number($bounds[$i+1])) {
-        die $log->fatal("Token bounds not numerical from external tokenizer ('$text_id')");
-      };
-      $output .= qq!    <span id="t_$c" from="! . $bounds[$i] . '" to="' .
-        $bounds[$i+1] . qq!" />\n!;
-      $c++;
+      $self->reset;
+      unless ($self->{pid}) { $log->error("Could not restart external tokenizer for '$text_id'"); last; };
+      $self->tokenize($self->{last_input});
+      next;
     };
 
-    while ($self->{select}->can_read(0)) {
-      $_ = <$out>;
+    $log->error(
+      "Skipping tokenization for '$text_id' after $attempt/" . (RETRY_ATTEMPTS + 1) .
+      ' attempts (' . $self->_input_context . "): $err"
+    );
+    $self->reset;
+    return;
+  };
 
-      if (defined $_ && $_ ne '') {
+  return;
+};
 
-        # This warning is sometimes thrown, though not yet replicated
-        # in the test suite. See the discussion in gerrit (3123:
-        # Establish tokenizer object for external base tokenization)
-        # for further issues.
-        $log->warn("Extra output: $_");
-      }
-      else {
-        $log->warn('Tokenizer seems to have crashed, restarting');
-        $self->reset;
-      };
+
+sub _to_string_once {
+  my ($self, $text_id) = @_;
+
+  die 'Tokenizer is not available' unless $self->{select};
+
+  my $output = $self->_header($text_id);
+  my ($bounds, $sentence_bounds) = $self->_read_bounds;
+
+  if ($self->{sentence_split}) {
+    for (my $i = 0; $i < @{$sentence_bounds}; $i += 2) {
+      push @{$self->{sentence_starts}}, $sentence_bounds->[$i];
+      push @{$self->{sentence_endss}}, $sentence_bounds->[$i + 1];
     };
   }
 
-  else {
-    die $log->fatal("Can\'t retrieve token bounds from external tokenizer ('$text_id')");
+  my $c = 0;
+  for (my $i = 0; $i < @{$bounds}; $i += 2) {
+    unless (
+      defined $bounds->[$i + 1] &&
+      looks_like_number($bounds->[$i]) &&
+      looks_like_number($bounds->[$i + 1])
+    ) {
+      die 'Token bounds not numerical';
+    };
+    $output .= qq!    <span id="t_$c" from="! . $bounds->[$i] . '" to="' .
+      $bounds->[$i + 1] . qq!" />\n!;
+    $c++;
   };
 
-  # Add footer
+  $self->_drain_output;
   return $output . $self->_footer;
 };
 
 
+sub _read_bounds {
+  my $self = shift;
+
+  unless ($self->{select}->can_read(WAIT_SECS)) {
+    die "Timed out after " . WAIT_SECS . "s while waiting for token bounds";
+  };
+
+  my $out = $self->{chld_out};
+  my $bounds_line = <$out>;
+  unless (defined $bounds_line && $bounds_line ne '') {
+    die $self->_read_error('token bounds');
+  };
+
+  my @sentence_bounds;
+  if ($self->{sentence_split}) {
+    my $sentence_bounds_line = <$out>;
+    unless (defined $sentence_bounds_line && $sentence_bounds_line ne '') {
+      die $self->_read_error('sentence bounds');
+    };
+    @sentence_bounds = split ' ', $sentence_bounds_line;
+  };
+
+  return ([split(' ', $bounds_line)], \@sentence_bounds);
+};
+
+
+sub _drain_output {
+  my $self = shift;
+  my $out = $self->{chld_out};
+
+  while ($self->{select}->can_read(0)) {
+    my $line = <$out>;
+
+    if (defined $line && $line ne '') {
+
+      # This warning is sometimes thrown, though not yet replicated
+      # in the test suite. See the discussion in gerrit (3123:
+      # Establish tokenizer object for external base tokenization)
+      # for further issues.
+      $log->warn("Extra output from external tokenizer: $line");
+    }
+    else {
+      $log->warn('Tokenizer ended after responding, restarting for the next text');
+      $self->reset;
+      last;
+    };
+  };
+};
+
+
+sub _read_error {
+  my ($self, $what) = @_;
+  return "Reached EOF while reading $what from external tokenizer (pid=" .
+    ($self->{pid} // 'n/a') . ')';
+};
+
+
+sub _input_context {
+  my $self = shift;
+  my $text = $self->{last_input} // '';
+  return 'chars=' . length($text) . ', snippet="' . _snippet($text) . '"';
+};
+
+
+sub _snippet {
+  my $text = shift // '';
+  $text =~ s/\s+/ /g;
+  $text =~ s/"/\\"/g;
+  if (length($text) > LOG_SNIPPET_CHARS) {
+    return substr($text, 0, LOG_SNIPPET_CHARS - 3) . '...';
+  };
+  return $text;
+};
+
+
 # Close communication channel
 sub close {
   my $self = shift;
-  close($self->{chld_in});
-  close($self->{chld_out});
+  close($self->{chld_in}) if defined $self->{chld_in};
+  close($self->{chld_out}) if defined $self->{chld_out};
   $self->{chld_out} = $self->{chld_in} = undef;
+  $self->{select} = undef;
 
   # Close the pid if still open
   if ($self->{pid}) {
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 3d35077..f8a26c2 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -10,7 +10,7 @@
 
 use File::Basename qw(dirname);
 
-use Encode qw(decode);
+use Encode qw(decode encode);
 
 use FindBin;
 BEGIN {
@@ -356,14 +356,22 @@
           # Tokenize with external tokenizer
           if ($ext_tok) {
 
-            # Tokenize and output
-            $ext_tok->tokenize($data->data)->to_zip(
-              $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
-              $text_id_esc
-            );
+            my $tokens_output = eval {
+              $ext_tok->tokenize($data->data)->to_string($text_id_esc);
+            };
 
-            if ($use_tokenizer_sentence_splits) {
-              $ext_tok->sentencize_from_previous_input($inline->structures);
+            if (my $err = $@) {
+              $err =~ s/\s+$//;
+              $log->error("Skipping external tokenization for '$text_id_esc': $err");
+              $ext_tok->reset;
+            }
+            elsif (defined $tokens_output) {
+              $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml")
+                ->print(encode('UTF-8', $tokens_output));
+
+              if ($use_tokenizer_sentence_splits) {
+                $ext_tok->sentencize_from_previous_input($inline->structures);
+              };
             };
           };
 
diff --git a/t/script.t b/t/script.t
index b892236..62b54ad 100644
--- a/t/script.t
+++ b/t/script.t
@@ -198,6 +198,72 @@
     ->element_count_is('spanList span', 227);
 };
 
+subtest 'Skip text after repeated external tokenizer crash' => sub {
+
+  my $cmd = catfile($f, 'cmd', 'tokenizer_faulty.pl');
+  my ($fh, $testfile) = korap_tempfile('script_exttok_skip');
+
+  print {$fh} <<'XML';
+<?xml version="1.0" encoding="UTF-8"?>
+<idsCorpus>
+  <idsHeader type="corpus">
+    <fileDesc>
+      <titleStmt>
+        <korpusSigle>CORP</korpusSigle>
+      </titleStmt>
+    </fileDesc>
+  </idsHeader>
+  <idsDoc version="1.0">
+    <idsHeader type="document">
+      <fileDesc>
+        <titleStmt>
+          <dokumentSigle>CORP/DOC</dokumentSigle>
+        </titleStmt>
+      </fileDesc>
+    </idsHeader>
+    <idsText version="1.0">
+      <idsHeader type="text">
+        <fileDesc>
+          <titleStmt>
+            <textSigle>CORP/DOC.00001</textSigle>
+          </titleStmt>
+        </fileDesc>
+      </idsHeader>
+      <text>
+        stable text
+      </text>
+    </idsText>
+    <idsText version="1.0">
+      <idsHeader type="text">
+        <fileDesc>
+          <titleStmt>
+            <textSigle>CORP/DOC.00002</textSigle>
+          </titleStmt>
+        </fileDesc>
+      </idsHeader>
+      <text>
+        __ALWAYS_CRASH__ text
+      </text>
+    </idsText>
+  </idsDoc>
+</idsCorpus>
+XML
+  close($fh);
+
+  test_tei2korapxml(
+    file => $testfile,
+    param => "-tc='perl $cmd'",
+    tmp => 'script_exttok_skip'
+  )
+    ->stderr_like(qr!tei2korapxml:.*? text_id=CORP_DOC\.00001!)
+    ->stderr_like(qr!tei2korapxml:.*? text_id=CORP_DOC\.00002!)
+    ->stderr_like(qr!External tokenizer failed for 'CORP_DOC\.00002' on attempt 1/2!)
+    ->stderr_like(qr!Skipping tokenization for 'CORP_DOC\.00002' after 2/2 attempts!)
+    ->file_readable('CORP/DOC/00001/base/tokens.xml')
+    ->file_exists_not('CORP/DOC/00002/base/tokens.xml')
+    ->file_readable('CORP/DOC/00002/data.xml');
+};
+
 subtest 'Check KorAP tokenizer for infinite loop bug' => sub {
 
   my $file = catfile($f, 'data', 'korap_tokenizer_challenge.xml');
diff --git a/t/tokenization-external.t b/t/tokenization-external.t
index 874f0fe..ac72e68 100644
--- a/t/tokenization-external.t
+++ b/t/tokenization-external.t
@@ -3,6 +3,7 @@
 use Test::More;
 use File::Basename 'dirname';
 use File::Spec::Functions qw/catfile/;
+use File::Temp qw/tempfile/;
 use Test::XML::Loy;
 
 use FindBin;
@@ -16,6 +17,7 @@
 
 my $f = dirname(__FILE__);
 my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+my $faulty_cmd = catfile($f, 'cmd', 'tokenizer_faulty.pl');
 
 # Test aggressive
 my $ext = KorAP::XML::TEI::Tokenizer::External->new(
@@ -55,5 +57,24 @@
 $t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
 $t->element_count_is('layer spanList span', 2);
 
+my (undef, $state_file) = tempfile();
+
+$ext = KorAP::XML::TEI::Tokenizer::External->new(
+  "perl $faulty_cmd '$state_file'"
+);
+$ext->tokenize("Der __CRASH_ONCE__ Mann");
+$str = $ext->to_string('retry-doc');
+ok($str, 'Tokenization succeeds after restarting the external tokenizer');
+$t = Test::XML::Loy->new($str);
+$t->element_exists('layer spanList span:nth-child(1)', 'Retry produces token bounds');
+
+$ext->tokenize("Der __ALWAYS_CRASH__ Mann");
+ok(!defined $ext->to_string('skip-doc'), 'Tokenization can be skipped after repeated crashes');
+
+$ext->tokenize("Der alte Mann");
+$str = $ext->to_string('recovered-doc');
+ok($str, 'Tokenizer can continue after a skipped text');
+$t = Test::XML::Loy->new($str);
+$t->element_count_is('layer spanList span', 3);
 
 done_testing;