Establish tokenizer object for external base tokenization Change-Id: Ie69c280042da5125e0934c87ccaad88b0be5494f

commit: 8b511f932a7e0f5d3869525b69006466b82c488d [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Jul 09 17:28:08 2020 +0200
committer: Peter Harders <harders@ids-mannheim.de> Fri Jul 10 20:52:27 2020 +0200
tree: 76ace32ccfecb3239b371e5131c91aadaaf321e0
parent: d962747a4ac7e02a6040fad736e4a8a45a6b4431 [diff]
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
new file mode 100644
index 0000000..8cfa0cf
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm

@@ -0,0 +1,165 @@
+package KorAP::XML::TEI::Tokenizer::External;
+use base 'KorAP::XML::TEI::Tokenizer';
+use strict;
+use warnings;
+use IO::Select;
+use IPC::Open2 qw(open2);
+
+# This tokenizer starts an external process for
+# tokenization. It writes the data to tokenize
+# to STDIN and reads boundary data from STDOUT.
+
+use constant {
+  WAIT_SECS => 3600
+};
+
+
+# Constructor for a tokenizer that is backed by an external process.
+#
+# Parameters:
+#   $cmd - command line that launches the external tokenizer, e.g.
+#          'java  -cp '. join(':', '.', glob(dirname(__FILE__) . "/../target/*.jar")).
+#          " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl"
+#   $sep - (optional) character sequence that is appended to every
+#          input to mark its end; a single newline if not defined
+#
+# Returns the new object after calling _init() on it. If no command
+# is given, a warning is printed to STDERR and nothing is returned.
+sub new {
+  my ($class, $cmd, $sep) = @_;
+
+  # Without a command there is nothing to start
+  if (!$cmd) {
+    print STDERR "WARNING: tokenizer not established.\n";
+    return;
+  };
+
+  # Sequence that separates inputs.
+  # TODO: needs to be explored further ...
+  #   - "\n\x03\n" (the former default) produces a warning in
+  #     't/tokenization-external.t' (WARNING: extra output: 0 1),
+  #     see discussion in gerrit
+  #     (3123: Establish tokenizer object for external base tokenization)
+  #   - an empty $sep leads to a blocking situation inside
+  #     t/cmd/tokenizer.pl (right before the while-loop)
+  $sep = "\n" unless defined $sep;
+
+  # Handles, pid and select object are filled in by _init()
+  my %state = (
+    cmd      => $cmd,
+    sep      => $sep,
+    pid      => undef,
+    chld_in  => undef,
+    chld_out => undef,
+    select   => undef,
+  );
+
+  my $self = bless \%state, $class;
+  $self->_init;
+
+  return $self;
+};
+
+
+# Hand a text over to the external tokenizer process.
+# The text, directly followed by the configured separator, is written
+# to the input handle of the process. Without a running process
+# (no pid) nothing is written and nothing is returned.
+# $offset is accepted but not used here.
+sub tokenize {
+  my ($self, $txt, $offset) = @_;
+
+  if ($self->{pid}) {
+    my $to_child = $self->{chld_in};
+    return print {$to_child} $txt . $self->{sep};
+  };
+
+  return;
+};
+
+
+# Start the external tokenizer process and bind the communication.
+#
+# On success the pid and both handles are stored in the object, and
+# the output handle of the process is registered in a fresh
+# IO::Select object (used later to wait for results).
+#
+# On failure a warning is printed to STDERR and pid, handles and
+# select object are all left undefined.
+#
+# Returns the pid of the started process, or nothing on failure.
+sub _init {
+  my $self = shift;
+
+  # open2() does not return a false value on failure - it raises
+  # an exception. Trap it, so a tokenizer that cannot be started
+  # results in a warning instead of killing the caller.
+  my $pid = eval {
+    open2(
+      $self->{chld_out},
+      $self->{chld_in},
+      $self->{cmd}
+    );
+  };
+
+  unless ($pid) {
+
+    # Do not keep half initialized state around
+    $self->{$_} = undef for qw(pid chld_in chld_out select);
+    print STDERR "WARNING: tokenizer can't be started.\n";
+    return;
+  };
+
+  $self->{pid} = $pid;
+  $self->{select} = IO::Select->new;
+  $self->{select}->add(*{$self->{chld_out}});
+
+  return $pid;
+};
+
+
+# Restart the tokenizer: the current communication is closed
+# and the process is initialized again.
+# Returns the tokenizer object itself.
+sub reset {
+  my $self = shift;
+
+  # Order matters: shut down first, then start again
+  for my $step (qw/close _init/) {
+    $self->$step;
+  };
+
+  return $self;
+};
+
+
+# Return the token boundaries of the last tokenized text as a string.
+#
+# The result consists of the header (for the given text id), a
+# comment naming the tokenizer command, one span element per pair of
+# boundaries read from the external tokenizer, and the footer.
+#
+# Parameters:
+#   $text_id - id of the text, passed on to _header()
+#
+# Returns nothing (with a warning) if the text id is missing and an
+# empty string if no tokenizer process was established.
+# Dies if the tokenizer does not answer within WAIT_SECS seconds.
+sub to_string {
+  my ($self, $text_id) = @_;
+
+  unless ($text_id) {
+    warn 'Missing textID';
+    return;
+  };
+
+  return '' unless $self->{select};
+
+  # Start header
+  my $output = $self->_header($text_id);
+
+  # '--' is not allowed inside of an XML comment, so separate
+  # adjacent hyphens of the command (e.g. '--opt' becomes '- -opt')
+  (my $cmd = $self->{cmd}) =~ s/-(?=-)/- /g;
+  $output .= '    <!-- ' . $cmd . " -->\n";
+
+  # Wait for the external tokenizer (WAIT_SECS seconds at most)
+  unless ($self->{select}->can_read(WAIT_SECS)) {
+    die "ERROR ($0): cannot retrieve token bounds from external tokenizer for text '$text_id' => Aborting ...\n";
+  };
+
+  # Read into lexical variables - the global $_ of the caller
+  # must not be overwritten. The line is undefined at EOF
+  # (e.g. if the tokenizer process died).
+  my $line = readline($self->{chld_out});
+  my @bounds = defined $line ? split(' ', $line) : ();
+
+  # Serialize all bounds; a dangling last value without
+  # a partner is ignored
+  my $c = 0;
+  for (my $i = 0; $i + 1 < @bounds; $i += 2) {
+    $output .= qq!    <span id="t_$c" from="! . $bounds[$i] . '" to="' .
+      $bounds[$i+1] . qq!" />\n!;
+    $c++;
+  };
+
+  # Drain everything else the tokenizer has already written
+  while ($self->{select} && $self->{select}->can_read(0)) {
+    my $extra = readline($self->{chld_out});
+
+    if (defined $extra && $extra ne '') {
+      print STDERR "WARNING: extra output: $extra\n";
+      next;
+    };
+
+    # EOF: the process is gone. After the restart there is nothing
+    # left to drain from the old process, so stop here instead of
+    # reading from the closed handle again.
+    print STDERR "WARNING: tokenizer seems to have crashed, restarting.\n";
+    $self->reset;
+    last;
+  };
+
+  # Add footer
+  return $output . $self->_footer;
+};
+
+
+# Close the communication channel to the external tokenizer
+# and wait for the process to end.
+#
+# Safe to call more than once and on an object whose process
+# was never started: only handles that are defined get closed.
+sub close {
+  my $self = shift;
+
+  # Close the input of the process first, so it sees EOF.
+  # CORE::close is spelled out, as this method has the same
+  # name as the builtin.
+  for my $key (qw(chld_in chld_out)) {
+    my $fh = $self->{$key};
+    CORE::close($fh) if defined $fh;
+    $self->{$key} = undef;
+  };
+
+  # The select object refers to the closed output handle only
+  $self->{select} = undef;
+
+  # Reap the process if it was started
+  if ($self->{pid}) {
+    waitpid $self->{pid}, 0;
+    $self->{pid} = undef;
+  };
+
+  return;
+};
+
+
+1;

diff --git a/script/tei2korapxml b/script/tei2korapxml
index 359fea3..671c26e 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -6,15 +6,12 @@
 use Getopt::Long qw(GetOptions :config no_auto_abbrev);
 
 use File::Basename qw(dirname);
-use IO::Handle;
-use IO::Select;
 
 use open qw(:std :utf8); # assume utf-8 encoding
 use Encode qw(encode_utf8 decode_utf8);
 
 use XML::CompactTree::XS;
 use XML::LibXML::Reader;
-use IPC::Open2 qw(open2);
 
 use FindBin;
 BEGIN {
@@ -22,6 +19,7 @@
 };
 
 use KorAP::XML::TEI;
+use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
 use KorAP::XML::TEI::Zipper;
@@ -35,7 +33,8 @@
 GetOptions(
   "root|r=s"  => \(my $_root_dir = '.'),  # name of root directory inside zip file
   "input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
-  'help|h'   => sub {
+  'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
+  'help|h'    => sub {
     pod2usage(
       -verbose => 99,
       -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
@@ -76,19 +75,26 @@
 # ~~~ constants ~~~
 #
 
-## DEPRECATED (only IDS-intern - the tokenization is normally done by external tools)
-my $_GEN_TOK_BAS               = 0;      # IDS internal tokenization
-  my( $chld_out, $chld_in, $pid, $select );
+my $_GEN_TOK_EXT = $tokenizer_call ? 1 : 0;      # (used for IDS internal tokenization)
+
+  # TODO:
+  #   Read tokenizer call from configuration file.
+  #   was 'java  -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar")). " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl";
+  my $ext_tok;
+  if ($tokenizer_call) {
+    $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
+  };
+  my $_tok_file_ext = "tokens.xml";
 ##
 
-## dummy tokenization (only for testing)
-my $_GEN_TOK_DUMMY             = 1;      # use dummy base tokenization for testing (base tokenization is normally done by external tools)
+## internal tokenization
+my $_GEN_TOK_INT               = 1;      # this simple tokenization can be used for testing (base tokenization is normally done by external tools)
   my $_tok_file_con            = "tokens_conservative.xml";
   my $_tok_file_agg            = "tokens_aggressive.xml";
   my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
   my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
   my ( $txt, $offset );
-my $_base_tokenization_dir     = "base"; # name of directory for storing files of dummy tokenization (only used in func. select_tokenization)
+my $_tok_dir         = "base"; # name of directory for storing tokenization files
 
 my $_DEBUG           = 0;                            # set to 1 for minimal more debug output (no need to be parametrized)
 my $_XCT_LN          = 0;                            # only for debugging: include line numbers in elements of $tree_data
@@ -185,10 +191,6 @@
 
 my ( $i, $c );                                       # index variables used in loops
 
-## DEPRECATED (only IDS-intern)
-my $_tok_file_bas = "tokens.xml";
-##
-
 my ( $_CORP_HEADER_END, $_DOC_HEADER_END, $_TEXT_HEADER_END );
 
 
@@ -232,10 +234,6 @@
 $data_sfx     = "</text>\n</raw_text>";
 
 
-## DEPRECATED (only IDS-intern)
-startTokenizer() if $_GEN_TOK_BAS;
-##
-
 # ~ read input and write output (text by text) ~
 process();
 
@@ -289,7 +287,7 @@
       # ~ end of text body ~
 
 
-      # write data.xml, structure.xml and evtl. morpho.xml and/or the dummy tokenization files (s.a.: $_tok_file_con and $_tok_file_agg)
+      # write data.xml, structure.xml and possibly morpho.xml and/or tokenization files (see also: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
 
       $pfx = $1; $sfx = $2;
 
@@ -333,13 +331,11 @@
 
         $data = encode_utf8( $data );
 
-        ## DEPRECATED (only IDS-intern)
-        # first write it to tokenization pipe to give it some time
-        if ( $_GEN_TOK_BAS ){
-          print $chld_in "$data\n\x03\n";
+        if ( $_GEN_TOK_EXT ){
+          # TODO: $offset is only necessary for $cons_tok and $aggr_tok and as long as they're part of 'retr_info'
+          $ext_tok->tokenize($data, $offset);
         }
-        ##
-      
+
         print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
 
 
@@ -358,13 +354,13 @@
         write_tokens() if $_TOKENS_PROC && @tokens;
 
 
-        # ~ dummy tokenization ~
+        # ~ tokenization ~
 
-        if ( $_GEN_TOK_BAS || $_GEN_TOK_DUMMY ){ ## DEPRECATED ($_GEN_TOK_BAS: only IDS-intern)
+        if ( $_GEN_TOK_EXT || $_GEN_TOK_INT ){
 
           select_tokenization();
 
-          if ( $_GEN_TOK_DUMMY ){
+          if ( $_GEN_TOK_INT ){
             $offset = 0;
             $aggr_tok->reset;
             $cons_tok->reset;
@@ -661,12 +657,9 @@
 
   $zipper->close;
 
-  ## DEPRECATED (only IDS-intern)
-  if( $_GEN_TOK_BAS ){
-    close($chld_in);
-    close($chld_out);
+  if( $_GEN_TOK_EXT ){
+    $ext_tok->close;
   }
-  ##
 
 } # end: sub process
 
@@ -1000,16 +993,17 @@
 
 
       #~~~~~
-      # from here (until end): dummy tokenization
+      # from here (until end): internal tokenization
       #~~~~~
 
-      if ( $_GEN_TOK_DUMMY ){
+      if ( $_GEN_TOK_INT ){
 
         $txt = $e->[1];
 
 
         if ( substr( $txt, 0, 1 ) ne ' ' || substr( $txt, 1, 1) ne ' ' ){ # $txt has at least 2 chars, if it's not empty or equal to ' '
 
+          # TODO: implement outside retr_info() (like $ext_tok) on whole $data, instead on every text-node (more efficient and $offset not needed anymore)
           $cons_tok->tokenize($txt, $offset);
           $aggr_tok->tokenize($txt, $offset);
 
@@ -1017,7 +1011,7 @@
 
         } # fi
 
-      } # fi: $_GEN_TOK_DUMMY
+      } # fi: $_GEN_TOK_INT
 
 
     #elsif ( $e->[0] == XML_READER_TYPE_ATTRIBUTE ) # attribute node
@@ -1038,35 +1032,22 @@
 
   #print STDERR "$0: select_tokenization() ...\n";
 
-  ## DEPRECATED (only IDS-intern)
-  if( $_GEN_TOK_BAS ) {
-    if( $select->can_read(3600) ){ # wait 60m for external tokenizer
-      $_ = <$chld_out>;
-      my @bounds = split;
-      write_tokenization("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_bas", $text_id_esc, \@bounds);
-      while($select->can_read(0)) {
-        $_ = <$chld_out>;
-        if (defined $_ && $_ ne '') {
-          print STDERR "WARNING: extra output: $_\n"
-        } else {
-          print STDERR "WARNING: tokenizer seems to have crashed, restarting.\n";
-          startTokenizer();
-        }
-      }
-    }else{
-      $zipper->close;
-      die "ERROR ($0): cannot retrieve token bounds from external tokenizer for text '$text_id' => Aborting ...\n";
-    }
-  ## 
-  }elsif( $_GEN_TOK_DUMMY ){
+  if( $_GEN_TOK_EXT ) {
+
+    $ext_tok->to_zip(
+      $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
+      $text_id_esc
+    );
+
+  }elsif( $_GEN_TOK_INT ){
 
     # Output token streams to zip streams
     $cons_tok->to_zip(
-      $zipper->new_stream("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_con"),
+      $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
       $text_id_esc
     );
     $aggr_tok->to_zip(
-      $zipper->new_stream("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_agg"),
+      $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
       $text_id_esc
     );
   }
@@ -1076,30 +1057,6 @@
 } # end: select_tokenization
 
 
-sub write_tokenization { # called from select_tokenization()
-
-  my ( $fname, $textid_esc, $bounds ) = @_;
-
-  $output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
-    ." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\"$text_id_esc\" xmlns=\"http://ids-mannheim.de/ns/KorAP\""
-    ." version=\"KorAP-0.4\">\n  <spanList>\n";
-
-  $c = 0;
-
-  for( $i = 0; $i < ($#$bounds + 1); $i +=  2 ){
-
-    $output .= "    <span id=\"t_$c\" from=\"".$bounds->[$i]."\" to=\"".$bounds->[$i+1]."\" />\n";
-
-    $c++;
-  }
-
-  $output .= "  </spanList>\n</layer>";
-
-  $zipper->new_stream($fname)->print($output);
-
-} # end: sub write_tokenization
-
-
 sub write_structures { # called from process()
 
   # ~ write @structures ~
@@ -1279,14 +1236,6 @@
 } # end: sub write_tokens
 
 
-## DEPRECATED ($_GEN_TOK_BAS: only IDS-intern)
-sub startTokenizer {
-  $pid = open2($chld_out, $chld_in, 'java  -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar"))." de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl");
-  $select = IO::Select->new();
-  $select->add(*$chld_out);
-}
-##
-
 __END__
 
 =pod

diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
new file mode 100644
index 0000000..e484160
--- /dev/null
+++ b/t/cmd/tokenizer.pl

@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+# Simple stand-in for an external tokenizer, used by the tests.
+# Every line read from STDIN is tokenized on its own and answered
+# with one line on STDOUT, containing the boundaries returned by
+# the aggressive tokenizer, separated by single spaces.
+use strict;
+use warnings;
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../../lib";
+};
+use KorAP::XML::TEI::Tokenizer::Aggressive;
+
+use open qw(:std :utf8); # assume utf-8 encoding
+
+# Do not buffer the output - the calling process
+# waits for an answer to every input
+$| = 1;
+
+# Init tokenizer
+my $tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+
+# Read lines from input and return boundaries.
+# Always read from STDIN (and not via '<>', which would read
+# from files named on the command line), until EOF is reached.
+while (defined(my $line = <STDIN>)) {
+  $tok->tokenize($line);
+  print join(' ', $tok->boundaries), "\n";
+  $tok->reset;
+};
+
+1;

diff --git a/t/script.t b/t/script.t
index 3ac91d1..5010789 100644
--- a/t/script.t
+++ b/t/script.t

@@ -32,6 +32,8 @@
 my $outzip = tmpnam();
 
 # Generate zip file (unportable!)
+# TODO:
+#   Call with aggressive and conservative tokenizations!
 stderr_like(
   sub { `cat '$file' | perl '$script' > '$outzip'` },
   qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
@@ -142,7 +144,37 @@
 # Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
 $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');
 
-# Read GOE/AGA/00000/base/tok.xml
+$tokens_xml = '';
+$tokens_xml .= $zip->getline while !$zip->eof;
+ok($zip->close, 'Closed');
+
+$t = Test::XML::Loy->new($tokens_xml);
+$t->attr_is('spanList span:nth-child(1)', 'to', 8);
+
+$t->attr_is('spanList span#t_1', 'from', 9);
+$t->attr_is('spanList span#t_1', 'to', 11);
+
+$t->attr_is('spanList span#t_67', 'from', 427);
+$t->attr_is('spanList span#t_67', 'to', 430);
+
+$t->attr_is('spanList span#t_214', 'from', 1209);
+$t->attr_is('spanList span#t_214', 'to', 1212);
+
+$t->element_count_is('spanList span', 227);
+
+# Tokenize with external tokenizer
+my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+stderr_like(
+  sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip'` },
+  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
+  'Processing'
+);
+
+# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
+$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
+
+# Read GOE/AGA/00000/base/tokens.xml
 $tokens_xml = '';
 $tokens_xml .= $zip->getline while !$zip->eof;
 ok($zip->close, 'Closed');

diff --git a/t/tokenization-external.t b/t/tokenization-external.t
new file mode 100644
index 0000000..e867aed
--- /dev/null
+++ b/t/tokenization-external.t

@@ -0,0 +1,51 @@
+# Tests for KorAP::XML::TEI::Tokenizer::External, using the
+# script t/cmd/tokenizer.pl as the external tokenizer process.
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use Data::Dumper;
+use File::Spec::Functions qw/catfile/;
+use File::Temp 'tempfile';
+use Test::XML::Loy;
+
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../lib";
+};
+
+require_ok('KorAP::XML::TEI::Tokenizer::External');
+
+my $f = dirname(__FILE__);
+my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+# Establish the external tokenizer
+my $ext = KorAP::XML::TEI::Tokenizer::External->new(
+  'perl ' . $cmd
+  #  'java -cp Ingestion/target/KorAP-Ingestion-pipeline.jar de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl'
+);
+
+# The constructor returns nothing without a command - make sure
+# there is an object before calling methods on it
+isa_ok($ext, 'KorAP::XML::TEI::Tokenizer::External');
+
+$ext->tokenize("Der alte Mann");
+# TODO:
+#   see comments on $sep in 'lib/KorAP/XML/TEI/Tokenizer/External.pm'
+#$ext->tokenize("ging über die Straße");
+
+my $str = $ext->to_string('unknown');
+my $t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 3);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 8);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->element_count_is('layer spanList span', 3);
+
+# Restart the tokenizer and send a text consisting of two lines;
+# only the boundaries of the first line are expected in the result
+$ext->reset;
+$ext->tokenize("Hu aha\ndas ist cool");
+
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 2);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 3);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
+$t->element_count_is('layer spanList span', 2);
+
+# Shut down the external process, so it is not left behind
+$ext->close;
+
+done_testing;
commit	8b511f932a7e0f5d3869525b69006466b82c488d	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Jul 09 17:28:08 2020 +0200
committer	Peter Harders <harders@ids-mannheim.de>	Fri Jul 10 20:52:27 2020 +0200
tree	76ace32ccfecb3239b371e5131c91aadaaf321e0
parent	d962747a4ac7e02a6040fad736e4a8a45a6b4431 [diff]