Establish tokenizer object for external base tokenization
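
For illustration, a minimal sketch of how the script now drives the
external tokenizer object, using only the calls introduced by this
change (the internals of KorAP::XML::TEI::Tokenizer::External are not
shown; $data, $offset, $zipper and the path variables are the ones
already used in the script):

    use KorAP::XML::TEI::Tokenizer::External;

    # start the external tokenizer once per run, with the command
    # line passed via --tokenizer-call/-tc
    my $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);

    # per text: hand the raw text data over to the tokenizer ...
    $ext_tok->tokenize($data, $offset);

    # ... and write the returned token boundaries as base/tokens.xml
    # into the output zip
    $ext_tok->to_zip(
      $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
      $text_id_esc
    );

    # shut the tokenizer down after all texts have been processed
    $ext_tok->close;
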
Change-Id: Ie69c280042da5125e0934c87ccaad88b0be5494f
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 359fea3..671c26e 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -6,15 +6,12 @@
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use File::Basename qw(dirname);
-use IO::Handle;
-use IO::Select;
use open qw(:std :utf8); # assume utf-8 encoding
use Encode qw(encode_utf8 decode_utf8);
use XML::CompactTree::XS;
use XML::LibXML::Reader;
-use IPC::Open2 qw(open2);
use FindBin;
BEGIN {
@@ -22,6 +19,7 @@
};
use KorAP::XML::TEI;
+use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Zipper;
@@ -35,7 +33,8 @@
GetOptions(
"root|r=s" => \(my $_root_dir = '.'), # name of root directory inside zip file
"input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
- 'help|h' => sub {
+ 'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
+ 'help|h' => sub {
pod2usage(
-verbose => 99,
-sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
@@ -76,19 +75,26 @@
# ~~~ constants ~~~
#
-## DEPRECATED (only IDS-intern - the tokenization is normally done by external tools)
-my $_GEN_TOK_BAS = 0; # IDS internal tokenization
- my( $chld_out, $chld_in, $pid, $select );
+my $_GEN_TOK_EXT = $tokenizer_call ? 1 : 0; # use external tokenizer call (as done IDS-internally for the base tokenization)
+
+# TODO:
+#   Read the tokenizer call from a configuration file.
+#   The former hard-wired call was:
+#   'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar")). " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl";
+my $ext_tok;
+if ($tokenizer_call) {
+  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
+};
+my $_tok_file_ext = "tokens.xml";
##
-## dummy tokenization (only for testing)
-my $_GEN_TOK_DUMMY = 1; # use dummy base tokenization for testing (base tokenization is normally done by external tools)
+## internal tokenization
+my $_GEN_TOK_INT = 1; # this simple tokenization can be used for testing (the base tokenization is normally done by external tools)
my $_tok_file_con = "tokens_conservative.xml";
my $_tok_file_agg = "tokens_aggressive.xml";
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my ( $txt, $offset );
-my $_base_tokenization_dir = "base"; # name of directory for storing files of dummy tokenization (only used in func. select_tokenization)
+my $_tok_dir = "base"; # name of directory for storing tokenization files
my $_DEBUG = 0; # set to 1 for minimal more debug output (no need to be parametrized)
my $_XCT_LN = 0; # only for debugging: include line numbers in elements of $tree_data
@@ -185,10 +191,6 @@
my ( $i, $c ); # index variables used in loops
-## DEPRECATED (only IDS-intern)
-my $_tok_file_bas = "tokens.xml";
-##
-
my ( $_CORP_HEADER_END, $_DOC_HEADER_END, $_TEXT_HEADER_END );
@@ -232,10 +234,6 @@
$data_sfx = "</text>\n</raw_text>";
-## DEPRECATED (only IDS-intern)
-startTokenizer() if $_GEN_TOK_BAS;
-##
-
# ~ read input and write output (text by text) ~
process();
@@ -289,7 +287,7 @@
# ~ end of text body ~
- # write data.xml, structure.xml and evtl. morpho.xml and/or the dummy tokenization files (s.a.: $_tok_file_con and $_tok_file_agg)
+ # write data.xml, structure.xml, possibly morpho.xml, and/or the tokenization files (see $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
$pfx = $1; $sfx = $2;
@@ -333,13 +331,11 @@
$data = encode_utf8( $data );
- ## DEPRECATED (only IDS-intern)
- # first write it to tokenization pipe to give it some time
- if ( $_GEN_TOK_BAS ){
- print $chld_in "$data\n\x03\n";
+ if ( $_GEN_TOK_EXT ){
+ # TODO: $offset is only needed by $cons_tok and $aggr_tok, and only as long as they are part of 'retr_info'
+ $ext_tok->tokenize($data, $offset);
}
- ##
-
+
print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
@@ -358,13 +354,13 @@
write_tokens() if $_TOKENS_PROC && @tokens;
- # ~ dummy tokenization ~
+ # ~ tokenization ~
- if ( $_GEN_TOK_BAS || $_GEN_TOK_DUMMY ){ ## DEPRECATED ($_GEN_TOK_BAS: only IDS-intern)
+ if ( $_GEN_TOK_EXT || $_GEN_TOK_INT ){
select_tokenization();
- if ( $_GEN_TOK_DUMMY ){
+ if ( $_GEN_TOK_INT ){
$offset = 0;
$aggr_tok->reset;
$cons_tok->reset;
@@ -661,12 +657,9 @@
$zipper->close;
- ## DEPRECATED (only IDS-intern)
- if( $_GEN_TOK_BAS ){
- close($chld_in);
- close($chld_out);
+ if( $_GEN_TOK_EXT ){
+ $ext_tok->close;
}
- ##
} # end: sub process
@@ -1000,16 +993,17 @@
#~~~~~
- # from here (until end): dummy tokenization
+ # from here (until end): internal tokenization
#~~~~~
- if ( $_GEN_TOK_DUMMY ){
+ if ( $_GEN_TOK_INT ){
$txt = $e->[1];
if ( substr( $txt, 0, 1 ) ne ' ' || substr( $txt, 1, 1) ne ' ' ){ # $txt has at least 2 chars, if it's not empty or equal to ' '
+ # TODO: implement this outside retr_info() (like $ext_tok) on the whole $data instead of on every text node (more efficient, and $offset would no longer be needed)
$cons_tok->tokenize($txt, $offset);
$aggr_tok->tokenize($txt, $offset);
@@ -1017,7 +1011,7 @@
} # fi
- } # fi: $_GEN_TOK_DUMMY
+ } # fi: $_GEN_TOK_INT
#elsif ( $e->[0] == XML_READER_TYPE_ATTRIBUTE ) # attribute node
@@ -1038,35 +1032,22 @@
#print STDERR "$0: select_tokenization() ...\n";
- ## DEPRECATED (only IDS-intern)
- if( $_GEN_TOK_BAS ) {
- if( $select->can_read(3600) ){ # wait 60m for external tokenizer
- $_ = <$chld_out>;
- my @bounds = split;
- write_tokenization("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_bas", $text_id_esc, \@bounds);
- while($select->can_read(0)) {
- $_ = <$chld_out>;
- if (defined $_ && $_ ne '') {
- print STDERR "WARNING: extra output: $_\n"
- } else {
- print STDERR "WARNING: tokenizer seems to have crashed, restarting.\n";
- startTokenizer();
- }
- }
- }else{
- $zipper->close;
- die "ERROR ($0): cannot retrieve token bounds from external tokenizer for text '$text_id' => Aborting ...\n";
- }
- ##
- }elsif( $_GEN_TOK_DUMMY ){
+ if( $_GEN_TOK_EXT ) {
+
+ $ext_tok->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
+ $text_id_esc
+ );
+
+ }elsif( $_GEN_TOK_INT ){
# Output token streams to zip streams
$cons_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_con"),
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
$text_id_esc
);
$aggr_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_agg"),
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
$text_id_esc
);
}
@@ -1076,30 +1057,6 @@
} # end: select_tokenization
-sub write_tokenization { # called from select_tokenization()
-
- my ( $fname, $textid_esc, $bounds ) = @_;
-
- $output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
- ." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\"$text_id_esc\" xmlns=\"http://ids-mannheim.de/ns/KorAP\""
- ." version=\"KorAP-0.4\">\n <spanList>\n";
-
- $c = 0;
-
- for( $i = 0; $i < ($#$bounds + 1); $i += 2 ){
-
- $output .= " <span id=\"t_$c\" from=\"".$bounds->[$i]."\" to=\"".$bounds->[$i+1]."\" />\n";
-
- $c++;
- }
-
- $output .= " </spanList>\n</layer>";
-
- $zipper->new_stream($fname)->print($output);
-
-} # end: sub write_tokenization
-
-
sub write_structures { # called from process()
# ~ write @structures ~
@@ -1279,14 +1236,6 @@
} # end: sub write_tokens
-## DEPRECATED ($_GEN_TOK_BAS: only IDS-intern)
-sub startTokenizer {
- $pid = open2($chld_out, $chld_in, 'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar"))." de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl");
- $select = IO::Select->new();
- $select->add(*$chld_out);
-}
-##
-
__END__
=pod