Establish tokenizer objects for aggressive and conservative base tokenization

Change-Id: I702098185b0b6292c73217268d4516d55a2f95b5

commit: d962747a4ac7e02a6040fad736e4a8a45a6b4431 [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Jul 09 16:53:09 2020 +0200
committer: Akron <nils@diewald-online.de> Thu Jul 09 17:01:06 2020 +0200
tree: 9f39e305982ab20660aaa09b669b8accfe00ddba
parent: 95612c3d5a2aeea3d81915b7a846a8dc69d46df4 [diff] [blame]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 4f2035e..359fea3 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -22,7 +22,8 @@
 };
 
 use KorAP::XML::TEI;
-use KorAP::XML::TEI::Tokenization;
+use KorAP::XML::TEI::Tokenizer::Conservative;
+use KorAP::XML::TEI::Tokenizer::Aggressive;
 use KorAP::XML::TEI::Zipper;
 
 our $VERSION = '0.01';
@@ -84,7 +85,9 @@
 my $_GEN_TOK_DUMMY             = 1;      # use dummy base tokenization for testing (base tokenization is normally done by external tools)
   my $_tok_file_con            = "tokens_conservative.xml";
   my $_tok_file_agg            = "tokens_aggressive.xml";
-  my ( @tok_tokens_con, @tok_tokens_agg, $txt, $offset );
+  my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+  my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
+  my ( $txt, $offset );
 my $_base_tokenization_dir     = "base"; # name of directory for storing files of dummy tokenization (only used in func. select_tokenization)
 
 my $_DEBUG           = 0;                            # set to 1 for minimal more debug output (no need to be parametrized)
@@ -362,7 +365,9 @@
           select_tokenization();
 
           if ( $_GEN_TOK_DUMMY ){
-            $offset = 0; @tok_tokens_con=(); @tok_tokens_agg=();
+            $offset = 0;
+            $aggr_tok->reset;
+            $cons_tok->reset;
           }
         }
 
@@ -1005,13 +1010,8 @@
 
         if ( substr( $txt, 0, 1 ) ne ' ' || substr( $txt, 1, 1) ne ' ' ){ # $txt has at least 2 chars, if it's not empty or equal to ' '
 
-          my $tok = KorAP::XML::TEI::Tokenization::conservative($txt, $offset);
-          push @tok_tokens_con, @$tok;
-
-          $tok = KorAP::XML::TEI::Tokenization::aggressive($txt, $offset);
-          push @tok_tokens_agg, @$tok;
-
-          ##$offset = $dl+1;
+          $cons_tok->tokenize($txt, $offset);
+          $aggr_tok->tokenize($txt, $offset);
 
           $offset = $dl;
 
@@ -1059,14 +1059,23 @@
     }
   ## 
   }elsif( $_GEN_TOK_DUMMY ){
-    write_tokenization("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_con", $text_id_esc, \@tok_tokens_con);
-    write_tokenization("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_agg", $text_id_esc, \@tok_tokens_agg);
+
+    # Output token streams to zip streams
+    $cons_tok->to_zip(
+      $zipper->new_stream("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_con"),
+      $text_id_esc
+    );
+    $aggr_tok->to_zip(
+      $zipper->new_stream("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_agg"),
+      $text_id_esc
+    );
   }
 
   #print STDERR "$0: write_tokenization(): DONE\n";
 
 } # end: select_tokenization
 
+
 sub write_tokenization { # called from select_tokenization()
 
   my ( $fname, $textid_esc, $bounds ) = @_;
commit	d962747a4ac7e02a6040fad736e4a8a45a6b4431	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Jul 09 16:53:09 2020 +0200
committer	Akron <nils@diewald-online.de>	Thu Jul 09 17:01:06 2020 +0200
tree	9f39e305982ab20660aaa09b669b8accfe00ddba
parent	95612c3d5a2aeea3d81915b7a846a8dc69d46df4 [diff] [blame]